mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 17:49:07 -05:00
Auto-fix removed extraneous f-string prefixes, unused imports (re, sys, textwrap, defaultdict), an unused local (qids), and converted datetime.now(timezone.utc) to datetime.now(UTC) (UP017). Manual fixes split colon/semicolon one-liners onto separate lines (E701/E702), renamed unused loop vars (cid, chain_id) with leading underscores (B007), replaced bare except with except Exception (E722), and renamed loop var L to level to satisfy N806.
101 lines
3.3 KiB
Python
101 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
"""Summarize a proposed chains.json — distribution, stats, sample inspection.
|
|
|
|
Run after build_chains_with_gemini.py to see what was produced before
|
|
applying. Produces a quick-read text report.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
# `parents[2]` is the directory two levels above the one containing this
# script; the vault directory is looked up directly beneath it.
VAULT_DIR = Path(__file__).resolve().parents[2].joinpath("vault")

# Report input used when --input is not given on the command line.
DEFAULT = VAULT_DIR.joinpath("chains.proposed.json")

# Maps each level label to an integer rank so that the step between two
# levels can be computed by subtraction.
LEVEL_RANK = {
    label: rank
    for rank, label in enumerate(("L1", "L2", "L3", "L4", "L5", "L6+"), start=1)
}
|
|
|
|
|
|
def main() -> int:
    """Print a plain-text summary of a proposed chains JSON file.

    Command-line options:
        --input    Path of the JSON file to summarize (defaults to DEFAULT).
        --samples  Number of leading chains to print in full (default 5);
                   zero or a negative value suppresses the sample section.

    The input is iterated as a sequence of chain objects. Each chain is
    read through the keys "questions", "track" and "topic" (plus
    "chain_id" and an optional "rationale" for sampled chains); each
    question through "id", an optional "level", and "title" when sampled.

    Returns:
        0 once the report has been printed.

    Raises:
        OSError: if the input file cannot be read.
        json.JSONDecodeError: if the input is not valid JSON.
        KeyError: if a chain or question lacks one of the required keys.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--input", default=str(DEFAULT))
    ap.add_argument("--samples", type=int, default=5, help="Show N sample chains")
    args = ap.parse_args()

    # Decode explicitly as UTF-8 rather than relying on the platform's
    # locale encoding, which differs between systems.
    chains = json.loads(Path(args.input).read_text(encoding="utf-8"))
    n = len(chains)
    print("=" * 60)
    print(f"PROPOSED CHAINS — {n} total")
    print("=" * 60)

    sizes = Counter(len(ch["questions"]) for ch in chains)
    print("\nchain size distribution:")
    for size in sorted(sizes):
        print(f" size {size}: {sizes[size]} chains")

    track_counts = Counter(ch["track"] for ch in chains)
    print("\nchains per track:")
    for t, c in track_counts.most_common():
        print(f" {t}: {c}")

    # Level deltas
    deltas = Counter()
    starts = Counter()
    for ch in chains:
        questions = ch["questions"]
        if not questions:
            # An empty chain has no first member and no consecutive pairs;
            # it is still reported above under "size 0".
            continue
        # Missing or unrecognized levels rank as 0.
        levels = [LEVEL_RANK.get(q.get("level"), 0) for q in questions]
        starts[questions[0].get("level")] += 1
        for previous, following in zip(levels, levels[1:]):
            deltas[following - previous] += 1

    print("\nstart-level distribution:")
    for level in sorted(LEVEL_RANK, key=LEVEL_RANK.get):
        if level in starts:
            print(f" {level}: {starts[level]}")
    # Surface start levels outside LEVEL_RANK (including a missing level,
    # shown as None) so the listed counts add up to the counted chains.
    for level, count in starts.items():
        if level not in LEVEL_RANK:
            print(f" {level!r}: {count} (unrecognized level)")

    print("\nconsecutive-member level Δ:")
    for d, c in sorted(deltas.items()):
        # One bar cell per 10 occurrences, capped at 60 cells.
        bar = "█" * min(c // 10, 60)
        print(f" Δ={d:+d} {c:>4} {bar}")

    # Multi-membership
    qid_count = Counter(q["id"] for ch in chains for q in ch["questions"])
    multi = Counter(qid_count.values())
    total_chained_qids = len(qid_count)
    print("\nmulti-chain membership:")
    print(f" total questions in any chain: {total_chained_qids}")
    for n_chains, count in sorted(multi.items()):
        if n_chains == 1:
            continue
        print(f" in {n_chains} chains: {count} questions")

    # Topic coverage
    topics = Counter(ch["topic"] for ch in chains)
    print("\ntopic coverage:")
    print(f" topics with at least 1 chain: {len(topics)}")
    if topics:
        # most_common(1) is an empty list when there are no chains at all.
        print(f" most-chained topic: {topics.most_common(1)[0]}")

    # Sample chains
    if args.samples > 0 and chains:
        print(f"\n{'=' * 60}")
        print(f"SAMPLE CHAINS (first {args.samples})")
        print("=" * 60)
        for ch in chains[:args.samples]:
            # The statistics above tolerate a missing level, so the sample
            # view does too: it shows "?" instead of raising KeyError.
            labels = [
                "?" if q.get("level") is None else str(q.get("level"))
                for q in ch["questions"]
            ]
            levels_str = " → ".join(labels)
            print(f"\n{ch['chain_id']} | {ch['track']} | {ch['topic']}")
            print(f" levels: {levels_str}")
            for i, (q, label) in enumerate(zip(ch["questions"], labels)):
                print(f" pos {i} {label} {q['id']} '{q['title'][:60]}'")
            if ch.get("rationale"):
                print(f" rationale: {ch['rationale']}")

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the report only when executed as a script, and hand main()'s
    # return value to the interpreter as the process exit status.
    raise SystemExit(main())
|