Files
cs249r_book/interviews/vault/scripts/fix_yaml_hygiene.py
Vijay Janapa Reddi 90b2abd178 feat(vault): add semantic-audit pipeline for question corpus QA
Adds the deterministic and semantic audit tooling used to drive the
release-readiness pass on the YAML question corpus:

- audit_yaml_corpus.py        — read-only schema + authoring-convention audit
- format_yaml_questions.py    — canonical formatter (idempotent)
- fix_yaml_hygiene.py         — bulk hygiene fixups
- prepare_semantic_review_queue.py — emit JSONL queues per track for LLM review
- semantic_audit_questions.py — parallel LLM audit runner (gpt-5.4-mini)
- run_semantic_audit_tracks.py — per-track orchestrator wrapping the runner
- build_semantic_fix_queue.py — collect findings into a prioritized fix queue
- compare_semantic_passes.py  — diff two semantic-audit passes for stability
- summarize_semantic_audit.py — markdown summary from findings JSONL

Also adds interviews/vault/audit/README.md describing the workflow.

Audit output artifacts (semantic-review-queue/, semantic-review-results/,
fresh-yaml-audit/) are produced by these scripts on demand and remain
untracked.
2026-05-05 09:08:56 -04:00

205 lines
12 KiB
Python

#!/usr/bin/env python3
"""Apply conservative mechanical hygiene fixes to question YAML files."""
from __future__ import annotations
import argparse
import re
from pathlib import Path
from typing import Any
import yaml
# This script lives in vault/scripts/, so parents[1] is the vault root.
VAULT_DIR = Path(__file__).resolve().parents[1]
QUESTIONS_DIR = VAULT_DIR / "questions"
# Inline-markup spans stripped by clean_question_text(): each pattern keeps the
# inner text (the capture group) and drops only the delimiters/brackets.
CODE_SPAN_RE = re.compile(r"`([^`]+)`")
MATH_SPAN_RE = re.compile(r"\$([^$]+)\$")
HTML_TAG_RE = re.compile(r"<([^>]+)>")
# Hand-curated replacement question text, keyed by question id.  An entry here
# bypasses the mechanical clean_question_text() pass entirely (see fix_record),
# so every value should already be final, single-line, markup-free prose.
QUESTION_REWRITES = {
    "cloud-2864": "What is the maximum concurrent request count for 8192-token requests with 80 layers, 8 KV heads, and head_dim 128?",
    "cloud-3808": "How does block-sparse GEMM remove capacity padding, and for batch 512, 8 experts, and capacity factor 1.5, what padding remains?",
    "cloud-1121": "Under strict Demographic Parity with the same total daily accepts, how many Group Y candidates are accepted and how many are historically unqualified?",
    "cloud-1607": "Which mitigation best prevents memorization under the compute budget: deduplication, DP-SGD, or RLHF refusal guardrails?",
    "cloud-1914": "What causes high CPU I/O wait and low page-cache hits when PyTorch shuffles individual files, and how would you fix it?",
    "cloud-3544": "How would you compare centralized and distributed edge labeling for 10M images in 3 months while preserving IAA and limiting bias?",
    "cloud-1854": "With 12ms base latency and 1ms per extra batched request, what maximum batch window supports batch size 16?",
    "cloud-4405": "How would you design disaggregated prefill/decode to meet p99 TTFT under 500ms and p99 generation under 4s including KV transfer?",
    "cloud-3285": "How do host-device data movement, DMA, zero-copy, and pinned memory affect latency and throughput in the two architectures?",
    "cloud-4508": "What is speculative decoding latency per accepted token, and below what acceptance rate does it underperform plain decoding?",
    "cloud-2734": "For 2,000 intra-rack 800G 2m links, should you choose active copper, optics, or rack redesign for 3-year TCO?",
    "cloud-4168": "How do NVLink and Infinity Fabric differ for tensor parallelism, and which is better for the given shard and AllReduce sizes?",
    "cloud-0983": "With 4 gradient buckets, perfect CUDA-stream overlap, and no scheduling overhead, what is the new backward-plus-sync time?",
    "cloud-1447": "How many A100 GPU-hours does a 0.5-hour shift-left validation step save per successfully deployed model?",
    "cloud-1241": "At 32MB/ms H2D bandwidth, what latency do 1,000 requests need in naive sync versus pipelined pinned-memory CUDA streams?",
    "cloud-4055": "How does 1GB AllReduce time on the torus compare with 256 H100s on 400Gbps Ethernet, and why does topology matter?",
    "cloud-1207": "Why is 60s nvidia-smi power insufficient for INT8 versus FP16 rollout, and what energy metric should guide deployment?",
    "cloud-2770": "How would you compare classifier filtering, sandboxed tools with human review, and instruction hierarchy for prompt injection defense?",
    "edge-1946": "How would you design hardware-aware NAS for Jetson Orin using SRAM, LPDDR5, INT8 FLOPs, and measured latency constraints?",
    "edge-2295": "How would you design a paged KV manager for Orin and compute concurrent 1024-token sessions in 4GB of KV memory?",
    "edge-1839": "How would you compare Cloud AI 100, Jetson Orin, and Intel Atom for smart-camera efficiency, programmability, and TCO?",
    "edge-1843": "For a perfectly efficient weight-stationary systolic array, what is the theoretical minimum layer time in milliseconds?",
    "edge-2370": "What TOPS and LPDDR5 bandwidth are required, is the system compute- or bandwidth-bound, and which Orin power mode suffices?",
    "edge-2363": "Using M/D/1, what is mean frame response time, how does it compare with M/M/1, and what latency is saved?",
    "edge-0974": "Where should weights be placed to minimize latency, and what is the flash-access time penalty per inference?",
    "edge-1726": "When Edge TPU activations exceed SRAM, how would you choose among downsampling, CPU/TPU splitting, and receptive-field changes?",
    "edge-2409": "Does camera DMA, model weight reads, or activation traffic starve the others under aggregate LPDDR5 utilization?",
    "edge-1993": "How would you route mixed critical and batch requests across Hailo-8 accelerators under failures or thermal throttling?",
    "edge-2284": "What is total broadcast time including bytes, DMA setup, and PCIe arbitration, and how much can pipelining save?",
    "edge-1585": "What BTU/hr thermal dissipation is required for eight 75W Cloud AI 100 cards to avoid throttling?",
    "edge-1376": "With 8 GB/s bandwidth and 32M parameters, what is the maximum inference rate for packed 4-bit weights?",
    "edge-2432": "During the 5-second shutdown window, which subsystems must checkpoint and in what canonical order?",
    "mobile-1605": "How do the two attention mechanisms trade off memory bandwidth, compute efficiency, and latency on this hardware?",
    "mobile-1614": "How would hardware-aware NAS use TOPS, memory limits, and MCUNet-style constraints to find a mobile architecture?",
    "mobile-2028": "What GFLOPs/s are required for nominal INT8 and mixed-precision fallback, and does fallback still hit 120 FPS?",
    "mobile-2157": "Why does depthwise convolution lose L1 reuse and fall into the memory-bound region of the Roofline model?",
    "mobile-1377": "How would you combine per-example clipping, secure noise, and INT8 quantization on a Hexagon NPU without weakening DP?",
    "mobile-1751": "What TOPS, power, and memory budget support on-device demographic-parity monitoring without hurting user experience?",
    "mobile-0903": "How would you design data curation, synchronization, and privacy-preserving collection for the target device?",
    "mobile-1903": "From the 30.3 FPS baseline, what steady-state FPS does double-buffering unlock, and which stage remains binding?",
    "mobile-1982": "From the 300ms naive estimate, what is the realistic worst-case queue drain time after cold-cache and arrival effects?",
    "mobile-1917": "How would you pipeline UNet activations to avoid system-RAM spills when tensors peak at 120MB and NPU SRAM is 32MB?",
    "mobile-1881": "What stage rate binds the Cloud-to-NPU load pipeline under a 64MB ring, and is the 3-minute SLA feasible?",
    "mobile-1932": "What memory-savings factor comes from FP16-to-INT4 weights after accounting for the static KV cache?",
    "mobile-2031": "How do FP16 and INT8 per-token decode latency compare on a contended 50GB/s LPDDR5 bus?",
    "mobile-1891": "Can checkpointing fit in the 1.5s grace window after serialization, UFS contention, and CPU/I/O overlap?",
    "tinyml-1389": "How would you estimate parameter count, INT8 memory footprint, and Cortex-M7 plus Ethos-U55 inference latency?",
    "tinyml-1634": "Does a 5- or 15-minute checkpoint interval minimize 1-hour expected energy, and by what factor?",
    "tinyml-1661": "Can the 2mF capacitor finish a 16KB checkpoint before brownout, and what capacitance would succeed?",
    "cloud-1249": "Using Young's formula, what are the optimal checkpoint interval and daily checkpointing overhead?",
    "edge-1119": "For a decode queue with lambda=8/s and mu=9.7/s, what is the correct wait time and why does caching matter?",
    "edge-2280": "What maximum processor utilization rho keeps average wait time at or below 40ms?",
    "edge-1549": "Which strategy yields higher sustained 24-hour throughput in a 40C sealed enclosure, and why?",
    "global-0085": "Why are GPUDirect RDMA latency and bandwidth limited even when the GPU and NIC share a PCIe switch?",
    "mobile-0136": "Which standard mobile SoC processor type is most energy-efficient for these operations?",
}
def clean_question_text(text: str) -> str:
    """Return *text* with inline markup stripped and LaTeX tokens ASCII-fied.

    Code spans, math spans, and HTML-style tags are reduced to their inner
    text, a fixed set of LaTeX commands is replaced with plain-ASCII
    equivalents, and all whitespace runs collapse to single spaces.
    """
    # Delimiter-stripping passes: each pattern keeps only its capture group.
    result = re.sub(r"`([^`]+)`", r"\1", text)
    result = re.sub(r"\$([^$]+)\$", r"\1", result)
    result = re.sub(r"<([^>]+)>", r"\1", result)
    # Literal LaTeX-to-ASCII substitutions, applied in a fixed order.
    for latex_token, ascii_form in (
        ("\\times", "x"),
        ("\\cdot", "x"),
        ("\\approx", "approximately"),
        ("\\lambda", "lambda"),
        ("\\sigma", "sigma"),
        ("\\alpha", "alpha"),
        ("\\pm", "+/-"),
        ("\\ge", ">="),
        ("\\le", "<="),
        ("\\_", "_"),
    ):
        result = result.replace(latex_token, ascii_form)
    # Collapse any run of whitespace (including newlines) to a single space.
    return " ".join(result.split())
def has_ordered_markers(text: str, markers: tuple[str, ...]) -> bool:
    """Return True when every marker appears in *text* in the given order.

    Each search starts where the previous match ended, so out-of-order or
    overlapping occurrences do not satisfy the check.  An empty marker
    tuple is trivially satisfied.
    """
    position = 0
    for needle in markers:
        found_at = text.find(needle, position)
        if found_at == -1:
            return False
        position = found_at + len(needle)
    return True
def normalize_common_mistake(text: str) -> str:
    """Wrap a bare common-mistake blurb in the canonical three-part template.

    Text that is empty (after stripping) or that already contains the
    Pitfall/Rationale/Consequence headers in order is returned unchanged;
    otherwise the stripped text becomes the Pitfall and boilerplate
    Rationale/Consequence sections are appended.
    """
    body = text.strip()
    already_structured = has_ordered_markers(
        body,
        ("**The Pitfall:**", "**The Rationale:**", "**The Consequence:**"),
    )
    if not body or already_structured:
        return text
    sections = [
        f"**The Pitfall:** {body}",
        "**The Rationale:** This mistake focuses on the surface symptom instead of the governing ML-systems constraint.",
        "**The Consequence:** The resulting design, estimate, or diagnosis can miss the real bottleneck and lead to incorrect deployment decisions.",
        "",
    ]
    return "\n".join(sections)
def normalize_napkin_math(text: str) -> str:
    """Wrap bare napkin-math text in the canonical three-section template.

    Text that is empty (after stripping) or that already carries the
    Assumptions/Calculations/Conclusion section headers in order is
    returned unchanged; otherwise the stripped text is placed in the
    Calculations section between boilerplate Assumptions and Conclusion.
    """
    body = text.strip()
    if not body:
        return text
    if has_ordered_markers(
        body, ("**Assumptions", "**Calculations:**", "**Conclusion")
    ):
        return text
    # Concatenation (not str.format) so literal braces in *body* survive.
    return (
        "**Assumptions & Constraints:**\n"
        "- Use the quantities and constraints stated in the scenario.\n"
        "\n"
        "**Calculations:**\n"
        + body
        + "\n"
        "\n"
        "**Conclusion & Interpretation:**\n"
        "- The calculation identifies the limiting systems constraint for this question.\n"
    )
def fix_record(data: dict[str, Any]) -> bool:
    """Apply in-place hygiene fixes to one parsed question record.

    Fixes applied:
    - drop a stale ``deletion_reason`` field from records whose status is
      not ``deleted``;
    - replace the question text with the curated QUESTION_REWRITES entry
      for its id, or else the mechanical clean_question_text() pass, and
      force published questions to end with a question mark;
    - normalize the ``common_mistake`` and ``napkin_math`` detail strings
      into their canonical templates.

    Returns True when any field was modified.
    """
    changed = False

    # deletion_reason only makes sense on records marked deleted.
    if "deletion_reason" in data and data.get("status") != "deleted":
        del data["deletion_reason"]
        changed = True

    original_question = data.get("question")
    if isinstance(original_question, str):
        rewritten = QUESTION_REWRITES.get(data.get("id"))
        if rewritten is None:
            rewritten = clean_question_text(original_question)
        # Published questions must read as questions.
        if "?" not in rewritten and data.get("status") == "published":
            rewritten = rewritten.rstrip(".:;! ") + "?"
        if rewritten != original_question:
            data["question"] = rewritten
            changed = True

    details = data.get("details")
    if isinstance(details, dict):
        # Both detail fields share the same normalize-compare-store shape.
        for field, normalizer in (
            ("common_mistake", normalize_common_mistake),
            ("napkin_math", normalize_napkin_math),
        ):
            value = details.get(field)
            if isinstance(value, str):
                normalized = normalizer(value)
                if normalized != value:
                    details[field] = normalized
                    changed = True

    return changed
def main() -> int:
    """Run the hygiene pass over every question YAML file.

    Walks ``<questions-dir>/*/*/*.yaml``, applies fix_record() to each
    parsed mapping, and rewrites only the files that actually changed.
    Prints a short report of the touched files (capped at 50 paths).
    Always returns 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--questions-dir", type=Path, default=QUESTIONS_DIR)
    args = parser.parse_args()
    changed_paths: list[Path] = []
    for path in sorted(args.questions_dir.glob("*/*/*.yaml")):
        # Explicit UTF-8: the files contain non-ASCII text and are written
        # back with allow_unicode=True, so relying on the platform default
        # encoding could raise or corrupt text on non-UTF-8 locales.
        data = yaml.safe_load(path.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            # Skip empty files and non-mapping documents untouched.
            continue
        if fix_record(data):
            path.write_text(
                yaml.safe_dump(
                    data,
                    allow_unicode=True,
                    default_flow_style=False,
                    sort_keys=False,
                    width=1000,  # keep long question strings on one line
                ),
                encoding="utf-8",
            )
            changed_paths.append(path)
    print(f"Applied hygiene fixes to {len(changed_paths)} file(s).")
    repo_root = VAULT_DIR.parents[1]
    for path in changed_paths[:50]:
        try:
            display = path.relative_to(repo_root)
        except ValueError:
            # --questions-dir may point outside the repository tree; fall
            # back to the path as given rather than crashing the report.
            display = path
        print(f"- {display}")
    if len(changed_paths) > 50:
        print(f"... and {len(changed_paths) - 50} more")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())