Files
cs249r_book/interviews/vault-cli/scripts/exemplar_coverage_audit.py
Vijay Janapa Reddi 43dedf9948 docs(vault): update architecture docs and audit scripts for 87-topic baseline
Update ARCHITECTURE.md to reflect 87 curated topics and 131 edges. Refactor exemplar_coverage_audit.py to use vault.db instead of retired corpus.json. Update exemplar-gaps.yaml inventory.
2026-04-26 16:47:56 -04:00

127 lines
4.5 KiB
Python

#!/usr/bin/env python3
"""Phase-0 exemplar-coverage audit.
Reads the current ``corpus.json`` and reports the per-(track, level, zone) cell
distribution of questions, flagging cells with fewer than 3 eligible exemplars.
As of Phase 0, the corpus does not carry a ``provenance`` field (that lands with
the YAML split in Phase 1). We therefore report raw per-cell counts AND
explicitly mark exemplar eligibility as ``unknown`` pending Phase-1 provenance
backfill. The audit shape is stable so Phase-1 re-runs slot in without
refactoring.
Referenced from ARCHITECTURE.md §14 Phase 0 milestone and REVIEWS.md R2-3 N-H3.
"""
from __future__ import annotations
import json
import sqlite3
import sys
from collections import Counter
from pathlib import Path
from typing import Any
EXEMPLAR_MIN = 3 # minimum eligible exemplars per (track, level, zone) cell
def load_corpus_from_db(db_path: Path) -> list[dict[str, Any]]:
"""Load questions from the vault.db SQLite file."""
if not db_path.exists():
raise SystemExit(f"error: vault.db not found at {db_path}")
conn = sqlite3.connect(db_path)
conn.row_factory = sqlite3.Row
cursor = conn.execute(
"SELECT track, level, zone, status, provenance FROM questions"
)
# Mock 'validated' as True for published questions until we have a real field
return [dict(row) for row in cursor.fetchall()]
def is_exemplar_eligible(q: dict[str, Any]) -> bool:
"""Whether this question could serve as an exemplar today."""
if q.get("status") != "published":
return False
# For legacy compatibility with Phase 0 logic, we don't have a 'validated'
# column in SQL yet, so we assume published questions are validated.
provenance = q.get("provenance")
if provenance is None:
return False
return provenance in {"human", "llm-then-human-edited"}
def audit(corpus: list[dict[str, Any]]) -> dict[str, Any]:
"""Group by (track, level, zone) and count total vs eligible per cell."""
total: Counter[tuple[str, str, str]] = Counter()
eligible: Counter[tuple[str, str, str]] = Counter()
for q in corpus:
track = (q.get("track") or "").lower() or "__missing__"
level = (q.get("level") or "").lower() or "__missing__"
zone = (q.get("zone") or "").lower() or "__missing__"
cell = (track, level, zone)
total[cell] += 1
if is_exemplar_eligible(q):
eligible[cell] += 1
cells = []
for cell, count in sorted(total.items()):
track, level, zone = cell
elig = eligible[cell]
cells.append({
"track": track, "level": level, "zone": zone,
"total_questions": count,
"eligible_exemplars": elig,
"gap": max(0, EXEMPLAR_MIN - elig),
})
return {
"phase": 1,
"note": (
"Phase-1 audit: using vault.db as source of truth. eligible_exemplars "
"count reflects the provenance field in the SQLite database."
),
"exemplar_minimum_per_cell": EXEMPLAR_MIN,
"total_cells": len(cells),
"cells_with_gap": sum(1 for c in cells if c["gap"] > 0),
"cells": cells,
}
def emit_yaml(report: dict[str, Any], out: Path) -> None:
"""Write YAML without importing PyYAML (keep Phase-0/1 deps minimal)."""
lines = [
f"phase: {report['phase']}",
f"note: {json.dumps(report['note'])}",
f"exemplar_minimum_per_cell: {report['exemplar_minimum_per_cell']}",
f"total_cells: {report['total_cells']}",
f"cells_with_gap: {report['cells_with_gap']}",
"cells:",
]
for c in report["cells"]:
lines.append(
f" - {{track: {c['track']}, level: {c['level']}, zone: {c['zone']}, "
f"total_questions: {c['total_questions']}, "
f"eligible_exemplars: {c['eligible_exemplars']}, gap: {c['gap']}}}"
)
out.write_text("\n".join(lines) + "\n", encoding="utf-8")
def main(argv: list[str]) -> int:
here = Path(__file__).resolve().parents[2] # interviews/
db_path = here / "vault" / "vault.db"
out_path = here / "vault" / "exemplar-gaps.yaml"
corpus = load_corpus_from_db(db_path)
report = audit(corpus)
emit_yaml(report, out_path)
print(f"exemplar audit: {report['total_cells']} cells, "
f"{report['cells_with_gap']} with gap < {EXEMPLAR_MIN} eligible")
print(f"report written to {out_path.relative_to(here.parent)}")
return 0
if __name__ == "__main__":
raise SystemExit(main(sys.argv[1:]))