Files
cs249r_book/interviews/vault-cli/scripts/suggest_exemplars.py
Vijay Janapa Reddi 9fdbfb9a4c refactor(vault-cli): rename --legacy-json to --local-json
The flag is the StaffML frontend's local-dev fallback (read corpus.json
from disk via NEXT_PUBLIC_VAULT_FALLBACK=static), not a deprecated path.
"Legacy" implied "soon to be removed"; "local-json" describes its actual
role and reads correctly in scripts and docs.

- vault-cli: rename CLI flag, parameter, result key, and help text.
- CI workflows + pre-commit config: invoke the new flag name.
- All scripts that print the command (suggest_exemplars,
  pre_commit_corpus_guard, promote_validated, rename_legacy_ids,
  export_to_staffml, the paper analyze_corpus/generate_*) updated.
- Comments and docs (ARCHITECTURE, CHANGELOG, REVIEWS, TESTING,
  MASSIVE_BUILD_RUNBOOK, DEPRECATED, AUTHORING, plus frontend
  comments and .env.example / .gitignore) updated.

The "legacy_json" sentinel string in corpus_stats.json._meta.source
is intentionally NOT renamed — it is a stable artifact format read
by downstream paper-generation tooling.
2026-04-30 09:30:28 -04:00


#!/usr/bin/env python3
"""Suggest candidate questions for the exemplar pool.

Queries vault.db for the highest-quality questions per topic, scored by:

- Has napkin_math (+3)
- Has common_mistake (+2)
- Solution length > 500 chars (+2)
- Scenario length > 300 chars (+1)

Outputs a ranked list and optionally a shell script of `vault mark-exemplar`
commands. Questions must first have their provenance changed from 'imported'
to 'human' in the YAML file before mark-exemplar will accept them.

Usage:
    python3 interviews/vault-cli/scripts/suggest_exemplars.py \
        --vault-dir interviews/vault \
        --top 3 \
        [--emit-script]
"""
from __future__ import annotations

import argparse
import sqlite3
from collections import defaultdict
from pathlib import Path

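
# A question's score ranges from 0 to 8 (3 + 2 + 2 + 1); higher means a
# stronger exemplar candidate.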
def score_question(row: dict) -> int:
    s = 0
    if row.get("napkin_math"):
        s += 3
    if row.get("common_mistake"):
        s += 2
    sol = row.get("realistic_solution") or ""
    if len(sol) > 500:
        s += 2
    scenario = row.get("scenario") or ""
    if len(scenario) > 300:
        s += 1
    return s


def main() -> None:
    parser = argparse.ArgumentParser(description="Suggest exemplar candidates")
    parser.add_argument("--vault-dir", type=Path, default=Path("interviews/vault"))
    parser.add_argument("--top", type=int, default=3, help="Candidates per topic")
    parser.add_argument("--emit-script", action="store_true", help="Print shell commands")
    parser.add_argument("--min-topics", type=int, default=10, help="Cover at least N topics")
    args = parser.parse_args()
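    # Note: --min-topics is parsed but not enforced below; selection is simply
    # the top N candidates per topic.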

    db_path = args.vault_dir / "vault.db"
    if not db_path.exists():
        print(f"error: {db_path} not found. Run `vault build` first.")
        raise SystemExit(1)

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
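    # Only published questions are considered; provenance ('imported' vs
    # 'human') is checked later by `vault mark-exemplar` itself.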
    rows = conn.execute(
        "SELECT id, title, topic, track, level, zone, scenario, "
        "common_mistake, realistic_solution, napkin_math, file_path "
        "FROM questions WHERE status = 'published' ORDER BY topic, id"
    ).fetchall()

    # Group by topic and score
    by_topic: dict[str, list[dict]] = defaultdict(list)
    for r in rows:
        d = dict(r)
        d["score"] = score_question(d)
        by_topic[d["topic"]].append(d)

    # Sort each topic by score descending
    for topic in by_topic:
        by_topic[topic].sort(key=lambda x: x["score"], reverse=True)

    # Select top N per topic
    selected: list[dict] = []
    for topic in sorted(by_topic.keys()):
        candidates = by_topic[topic][: args.top]
        selected.extend(candidates)

    # Print summary
    topics_covered = len(set(q["topic"] for q in selected))
    print(f"Selected {len(selected)} candidates across {topics_covered} topics\n")
    if args.emit_script:
        print("#!/bin/bash")
        print("# Run after changing provenance to 'human' in each YAML file")
        print(f"# Generated by suggest_exemplars.py --top {args.top}\n")
        for q in selected:
            print(f"# [{q['topic']}] {q['title'][:60]} (score={q['score']})")
            print(f"vault mark-exemplar {q['id']} --vault-dir {args.vault_dir}")
            print()
    else:
        # Print table
        current_topic = None
        for q in selected:
            if q["topic"] != current_topic:
                current_topic = q["topic"]
                print(f"\n── {current_topic} ──")
            print(
                f" [{q['score']}] {q['id']}"
                f" {q['level']}/{q['zone']}/{q['track']}"
                f" {q['title'][:70]}"
            )
        print(f"\nTotal: {len(selected)} candidates, {topics_covered} topics")
        print(
            "\nNext steps:"
            "\n 1. Change provenance: imported → human in each YAML"
            "\n 2. Run: vault mark-exemplar <id> --vault-dir interviews/vault"
            "\n 3. Run: vault build --local-json --vault-dir interviews/vault"
        )


if __name__ == "__main__":
    main()