Files
cs249r_book/interviews/vault/scripts/prepare_semantic_review_queue.py
Vijay Janapa Reddi 90b2abd178 feat(vault): add semantic-audit pipeline for question corpus QA
Adds the deterministic and semantic audit tooling used to drive the
release-readiness pass on the YAML question corpus:

- audit_yaml_corpus.py        — read-only schema + authoring-convention audit
- format_yaml_questions.py    — canonical formatter (idempotent)
- fix_yaml_hygiene.py         — bulk hygiene fixups
- prepare_semantic_review_queue.py — emit JSONL queues per track for LLM review
- semantic_audit_questions.py — parallel LLM audit runner (gpt-5.4-mini)
- run_semantic_audit_tracks.py — per-track orchestrator wrapping the runner
- build_semantic_fix_queue.py — collect findings into a prioritized fix queue
- compare_semantic_passes.py  — diff two semantic-audit passes for stability
- summarize_semantic_audit.py — markdown summary from findings JSONL

Also adds interviews/vault/audit/README.md describing the workflow.

Audit output artifacts (semantic-review-queue/, semantic-review-results/,
fresh-yaml-audit/) are produced by these scripts on demand and remain
untracked.
2026-05-05 09:08:56 -04:00
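
A single-track run of the queue builder below might look like this (flag names as defined in its argument parser; invocations of the downstream semantic-audit scripts are not shown):

    python cs249r_book/interviews/vault/scripts/prepare_semantic_review_queue.py --track cloud --batch-size 100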

108 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""Build a semantic review queue for published StaffML questions."""

from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Any

import yaml

VAULT_DIR = Path(__file__).resolve().parents[1]
REPO_ROOT = VAULT_DIR.parents[1]
QUESTIONS_DIR = VAULT_DIR / "questions"
DEFAULT_OUT = VAULT_DIR / "audit" / "semantic-review-queue"


def rel(path: Path) -> str:
    return str(path.relative_to(REPO_ROOT))


def review_record(path: Path, data: dict[str, Any]) -> dict[str, Any]:
    details = data.get("details") if isinstance(data.get("details"), dict) else {}
    return {
        "qid": data.get("id"),
        "path": rel(path),
        "track": data.get("track"),
        "level": data.get("level"),
        "zone": data.get("zone"),
        "topic": data.get("topic"),
        "competency_area": data.get("competency_area"),
        "bloom_level": data.get("bloom_level"),
        "phase": data.get("phase"),
        "title": data.get("title"),
        "scenario": data.get("scenario"),
        "question": data.get("question"),
        "realistic_solution": details.get("realistic_solution"),
        "common_mistake": details.get("common_mistake"),
        "napkin_math": details.get("napkin_math"),
        "options": details.get("options"),
        "correct_index": details.get("correct_index"),
    }
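
# Illustrative shape of one emitted record (all values hypothetical, not taken
# from the corpus):
# {"qid": "q-cloud-0042", "track": "cloud", "level": "staff", "zone": "serving",
#  "title": "...", "scenario": "...", "question": "...",
#  "realistic_solution": "...", "options": null, "correct_index": null, ...}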


def write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w") as handle:
        for row in rows:
            handle.write(json.dumps(row, sort_keys=True) + "\n")


def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--questions-dir", type=Path, default=QUESTIONS_DIR)
    parser.add_argument("--out-dir", type=Path, default=DEFAULT_OUT)
    parser.add_argument("--track", choices=["cloud", "edge", "mobile", "tinyml", "global"])
    parser.add_argument("--batch-size", type=int, default=100)
    args = parser.parse_args()

    # Collect every published question, optionally filtered to a single track.
    rows: list[dict[str, Any]] = []
    for path in sorted(args.questions_dir.glob("*/*/*.yaml")):
        data = yaml.safe_load(path.read_text())
        if not isinstance(data, dict):
            continue
        if data.get("status") != "published":
            continue
        if args.track and data.get("track") != args.track:
            continue
        rows.append(review_record(path, data))

    # Combined queue across all tracks.
    write_jsonl(args.out_dir / "published_semantic_queue.jsonl", rows)

    # Per-track queues, plus fixed-size batches for the parallel LLM runner.
    by_track: dict[str, list[dict[str, Any]]] = {}
    for row in rows:
        by_track.setdefault(str(row.get("track")), []).append(row)
    for track, track_rows in sorted(by_track.items()):
        write_jsonl(args.out_dir / f"{track}_published_semantic_queue.jsonl", track_rows)
        for idx in range(0, len(track_rows), args.batch_size):
            batch_no = idx // args.batch_size + 1
            batch_path = args.out_dir / "batches" / track / f"{track}_batch_{batch_no:03d}.jsonl"
            write_jsonl(batch_path, track_rows[idx : idx + args.batch_size])

    # Reviewer instructions written alongside the queues.
    prompt = """You are reviewing StaffML interview-question YAML for release quality.
For each JSONL record, evaluate:
1. scenario_question_fit: Does the question follow from the scenario?
2. answer_correctness: Does realistic_solution answer the question directly?
3. common_mistake_quality: Is the pitfall plausible, specific, and useful?
4. napkin_math_correctness: Are formulas, units, and conclusions correct?
5. physical_plausibility: Are hardware/software numbers realistic?
6. level_fit: Does level/bloom/zone match the cognitive demand?
7. title_quality: Is the title concrete and searchable?
Return one JSON object per input record with:
qid, verdict ("pass"|"needs_fix"), severity ("blocker"|"major"|"minor"|"none"),
issues [short strings], suggested_fix_summary, and confidence.
Do not edit YAML directly during review. Produce findings only.
"""
    (args.out_dir / "semantic_review_prompt.md").write_text(prompt)
    print(f"Wrote {len(rows)} published-question records to {rel(args.out_dir)}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
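
For reference, a minimal sketch of how a downstream runner might consume one emitted batch, assuming the batch layout produced above; the validation helper mirrors the output contract stated in the prompt, and the concrete batch path is hypothetical:

import json
from pathlib import Path

# Fields the prompt asks the reviewer to return for every record.
REQUIRED_KEYS = {"qid", "verdict", "severity", "issues", "suggested_fix_summary", "confidence"}

def load_jsonl(path: Path) -> list[dict]:
    # One JSON object per line, as written by write_jsonl above.
    return [json.loads(line) for line in path.read_text().splitlines() if line.strip()]

def check_finding(finding: dict) -> bool:
    # Validate one reviewer finding against the contract in semantic_review_prompt.md.
    return (
        REQUIRED_KEYS <= finding.keys()
        and finding["verdict"] in {"pass", "needs_fix"}
        and finding["severity"] in {"blocker", "major", "minor", "none"}
    )

# Hypothetical location; real batch files land under <out-dir>/batches/<track>/.
batch_path = Path("audit/semantic-review-queue/batches/cloud/cloud_batch_001.jsonl")
records = load_jsonl(batch_path)
print(f"{len(records)} queue records in {batch_path.name}")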