vault-cli: rename the legacy-style JSON build flag to --local-json.

The flag is the StaffML frontend's local-dev fallback (read corpus.json from disk via NEXT_PUBLIC_VAULT_FALLBACK=static), not a deprecated path. "Legacy" implied "soon to be removed"; "local-json" describes its actual role and reads correctly in scripts and docs.

- vault-cli: rename the CLI flag, parameter, result key, and help text.
- CI workflows + pre-commit config: invoke the new flag name.
- All scripts that print the command (suggest_exemplars, pre_commit_corpus_guard, promote_validated, rename_legacy_ids, export_to_staffml, the paper analyze_corpus/generate_*) updated.
- Comments and docs (ARCHITECTURE, CHANGELOG, REVIEWS, TESTING, MASSIVE_BUILD_RUNBOOK, DEPRECATED, AUTHORING, plus frontend comments and .env.example / .gitignore) updated.

The "legacy_json" sentinel string in corpus_stats.json._meta.source is intentionally NOT renamed; it is a stable artifact format read by downstream paper-generation tooling.
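For reference, a minimal sketch of the renamed pieces working together (the .env.example line restates the variable named above; the vault build invocation is the one this script prints in its "Next steps" output):

    # StaffML frontend, local-dev fallback: read corpus.json from disk
    NEXT_PUBLIC_VAULT_FALLBACK=static

    # vault-cli, rebuilding with the renamed flag
    vault build --local-json --vault-dir interviews/vault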
#!/usr/bin/env python3
"""Suggest candidate questions for the exemplar pool.

Queries vault.db for the highest-quality questions per topic, scored by:
- Has napkin_math (+3)
- Has common_mistake (+2)
- Solution length > 500 chars (+2)
- Scenario length > 300 chars (+1)

Outputs a ranked list and optionally a shell script of `vault mark-exemplar`
commands. Questions must first have their provenance changed from 'imported'
to 'human' in the YAML file before mark-exemplar will accept them.

Usage:
    python3 interviews/vault-cli/scripts/suggest_exemplars.py \
        --vault-dir interviews/vault \
        --top 3 \
        [--emit-script]
"""

from __future__ import annotations

import argparse
import sqlite3
from collections import defaultdict
from pathlib import Path


def score_question(row: dict) -> int:
    s = 0
    if row.get("napkin_math"):
        s += 3
    if row.get("common_mistake"):
        s += 2
    sol = row.get("realistic_solution") or ""
    if len(sol) > 500:
        s += 2
    scenario = row.get("scenario") or ""
    if len(scenario) > 300:
        s += 1
    return s


def main() -> None:
    parser = argparse.ArgumentParser(description="Suggest exemplar candidates")
    parser.add_argument("--vault-dir", type=Path, default=Path("interviews/vault"))
    parser.add_argument("--top", type=int, default=3, help="Candidates per topic")
    parser.add_argument("--emit-script", action="store_true", help="Print shell commands")
    parser.add_argument("--min-topics", type=int, default=10, help="Cover at least N topics")
    args = parser.parse_args()

    db_path = args.vault_dir / "vault.db"
    if not db_path.exists():
        print(f"error: {db_path} not found. Run `vault build` first.")
        raise SystemExit(1)

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    rows = conn.execute(
        "SELECT id, title, topic, track, level, zone, scenario, "
        "common_mistake, realistic_solution, napkin_math, file_path "
        "FROM questions WHERE status = 'published' ORDER BY topic, id"
    ).fetchall()

    # Group by topic and score
    by_topic: dict[str, list[dict]] = defaultdict(list)
    for r in rows:
        d = dict(r)
        d["score"] = score_question(d)
        by_topic[d["topic"]].append(d)

    # Sort each topic by score descending
    for topic in by_topic:
        by_topic[topic].sort(key=lambda x: x["score"], reverse=True)

    # Select top N per topic
    selected: list[dict] = []
    for topic in sorted(by_topic.keys()):
        candidates = by_topic[topic][: args.top]
        selected.extend(candidates)

    # Print summary
    topics_covered = len(set(q["topic"] for q in selected))
    print(f"Selected {len(selected)} candidates across {topics_covered} topics\n")

    if args.emit_script:
        print("#!/bin/bash")
        print("# Run after changing provenance to 'human' in each YAML file")
        print(f"# Generated by suggest_exemplars.py --top {args.top}\n")
        for q in selected:
            print(f"# [{q['topic']}] {q['title'][:60]} (score={q['score']})")
            print(f"vault mark-exemplar {q['id']} --vault-dir {args.vault_dir}")
            print()
    else:
        # Print table
        current_topic = None
        for q in selected:
            if q["topic"] != current_topic:
                current_topic = q["topic"]
                print(f"\n── {current_topic} ──")
            print(
                f" [{q['score']}] {q['id']}"
                f" {q['level']}/{q['zone']}/{q['track']}"
                f" {q['title'][:70]}"
            )

        print(f"\nTotal: {len(selected)} candidates, {topics_covered} topics")
        print(
            "\nNext steps:"
            "\n 1. Change provenance: imported → human in each YAML"
            "\n 2. Run: vault mark-exemplar <id> --vault-dir interviews/vault"
            "\n 3. Run: vault build --local-json --vault-dir interviews/vault"
        )


if __name__ == "__main__":
    main()