Files
cs249r_book/interviews/vault-cli/scripts/summarize_proposed_chains.py
Vijay Janapa Reddi 5225059754 fix(vault-cli): clear ruff violations flagged by --all-files sweep
Auto-fix removed extraneous f-string prefixes, unused imports
(re, sys, textwrap, defaultdict), an unused local (qids), and
converted datetime.now(timezone.utc) to datetime.now(UTC) (UP017).
Manual fixes split colon/semicolon one-liners onto separate lines
(E701/E702), renamed unused loop vars (cid, chain_id) with leading
underscores (B007), replaced bare except with except Exception (E722),
and renamed loop var L to level to satisfy N806.
2026-05-02 09:17:15 -04:00

101 lines
3.3 KiB
Python

#!/usr/bin/env python3
"""Summarize a proposed chains.json — distribution, stats, sample inspection.
Run after build_chains_with_gemini.py to see what was produced before
applying. Produces a quick-read text report.
"""
from __future__ import annotations
import argparse
import json
from collections import Counter
from pathlib import Path
# The ``vault`` directory that sits under this file's third ancestor directory.
VAULT_DIR = Path(__file__).resolve().parents[2].joinpath("vault")

# Input file summarized when ``--input`` is not given on the command line.
DEFAULT = VAULT_DIR.joinpath("chains.proposed.json")

# Numeric rank for each level label (1-based, in ascending label order);
# used to compute level differences between consecutive chain members.
LEVEL_RANK = {
    label: rank
    for rank, label in enumerate(("L1", "L2", "L3", "L4", "L5", "L6+"), start=1)
}
def main() -> int:
    """Print a plain-text summary report for a proposed chains JSON file.

    Command-line options:
        --input    Path of the JSON file to read (defaults to ``DEFAULT``).
        --samples  Number of leading chains to print in full (default 5);
                   zero or a negative value suppresses the sample section.

    The input must be a JSON list of chain objects. Every chain needs the
    keys ``questions`` (a list of question dicts, each with an ``id``),
    ``track`` and ``topic``. ``chain_id`` is read only for chains that are
    printed as samples; ``rationale`` is optional. A question's ``level``
    and ``title`` may be absent.

    Returns:
        0, intended to be used as the process exit status.

    Raises:
        OSError: if the input file cannot be read.
        json.JSONDecodeError: if the input is not valid JSON.
        KeyError: if a chain or question lacks one of the required keys.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--input", default=str(DEFAULT))
    ap.add_argument("--samples", type=int, default=5, help="Show N sample chains")
    args = ap.parse_args()

    # Read as UTF-8 explicitly rather than depending on the locale default.
    chains = json.loads(Path(args.input).read_text(encoding="utf-8"))
    n = len(chains)
    rule = "=" * 60

    print(rule)
    print(f"PROPOSED CHAINS — {n} total")
    print(rule)

    sizes = Counter(len(ch["questions"]) for ch in chains)
    print("\nchain size distribution:")
    for size in sorted(sizes):
        print(f" size {size}: {sizes[size]} chains")

    track_counts = Counter(ch["track"] for ch in chains)
    print("\nchains per track:")
    for t, c in track_counts.most_common():
        print(f" {t}: {c}")

    # Level deltas between consecutive members, and each chain's start level.
    deltas: Counter[int] = Counter()
    starts: Counter[str | None] = Counter()
    for ch in chains:
        questions = ch["questions"]
        if not questions:
            # A chain without members has neither a start level nor deltas.
            continue
        starts[questions[0].get("level")] += 1
        # Levels missing from LEVEL_RANK (or absent) are ranked as 0.
        ranks = [LEVEL_RANK.get(q.get("level"), 0) for q in questions]
        for prev_rank, next_rank in zip(ranks, ranks[1:]):
            deltas[next_rank - prev_rank] += 1

    print("\nstart-level distribution:")
    for level in LEVEL_RANK:
        if level in starts:
            print(f" {level}: {starts[level]}")

    print("\nconsecutive-member level Δ:")
    for d, c in sorted(deltas.items()):
        # One bar cell per 10 occurrences, capped at 60 cells.
        bar = "█" * min(c // 10, 60)
        print(f" Δ={d:+d} {c:>4} {bar}")

    # Multi-membership: how many chains each question id appears in.
    qid_count: Counter[str] = Counter()
    for ch in chains:
        for q in ch["questions"]:
            qid_count[q["id"]] += 1
    multi = Counter(qid_count.values())
    total_chained_qids = len(qid_count)
    print("\nmulti-chain membership:")
    print(f" total questions in any chain: {total_chained_qids}")
    for n_chains, count in sorted(multi.items()):
        if n_chains == 1:
            continue
        print(f" in {n_chains} chains: {count} questions")

    # Topic coverage
    topics = Counter(ch["topic"] for ch in chains)
    print("\ntopic coverage:")
    print(f" topics with at least 1 chain: {len(topics)}")
    if topics:
        # most_common(1) is empty when there are no chains at all.
        print(f" most-chained topic: {topics.most_common(1)[0]}")

    # Sample chains: a non-positive count shows nothing (a negative slice
    # bound would otherwise select from the end of the list).
    shown = chains[:args.samples] if args.samples > 0 else []
    if shown:
        print(f"\n{rule}")
        print(f"SAMPLE CHAINS (first {len(shown)})")
        print(rule)
        for ch in shown:
            questions = ch["questions"]
            # "?" stands in for a missing level, matching the tolerant
            # .get("level") lookups used for the statistics above.
            levels_str = " → ".join(str(q.get("level") or "?") for q in questions)
            print(f"\n{ch['chain_id']} | {ch['track']} | {ch['topic']}")
            print(f" levels: {levels_str}")
            for i, q in enumerate(questions):
                level = q.get("level") or "?"
                title = str(q.get("title") or "")[:60]
                print(f" pos {i} {level} {q['id']} '{title}'")
            if ch.get("rationale"):
                print(f" rationale: {ch['rationale']}")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with main()'s return value as the status code.
    raise SystemExit(main())