Files
cs249r_book/interviews/vault/scripts/audit_resources_migration.py
Vijay Janapa Reddi 04eddd2b35 chore(vault): add Phase 0 audit for deep_dive → resources migration
One-shot read-only script that walks every question YAML and reports:
- total questions, deep_dive coverage, hostname distribution
- book-host references (mlsysbook.ai, harvard-edge.github.io)
- orphans missing title (name-fallback candidates during migration)
- questions whose only ref is a book URL (would lose all refs)

Phase 0 finding from first run against 9657 question YAMLs:
- ZERO questions have the details.deep_dive field populated
- Confirms the corpus was already stripped of per-question references
  during an earlier vault migration; the refs.ts header comment about
  "4,000+ deep_dive_url values" reflects pre-migration state
- The UI conditional on current.details.deep_dive_url in practice/page.tsx
  currently renders for zero questions — it is dead code

Implication: the planned deep_dive → resources migration does not need
to touch any question YAMLs. The change reduces to (a) schema evolution,
(b) dead UI removal, (c) manifest + probe deletion. The audit script is
retained as a regression guard — if the field ever comes back it surfaces
in the next audit run.

Report output is gitignored via the scripts/_*.json pattern.
2026-04-16 18:11:59 -04:00

183 lines
7.1 KiB
Python

#!/usr/bin/env python3
"""
Phase 0 audit for the deep_dive → resources migration.
Walks every question YAML under interviews/vault/questions/, extracts the
details.deep_dive field (when present), and emits counts that determine
the migration's blast radius:
- total questions
- questions with deep_dive present / absent
- URL hostname distribution
- book-host questions (mlsysbook.ai, harvard-edge.github.io) — these get
dropped on the floor during migration per the resources-list plan
- orphan questions: have deep_dive.url but no deep_dive.title (name fallback
needed during migration)
- "book-only" questions: their single deep_dive is a book URL, so post-migration
they have zero resources — author review candidate list
Read-only. Writes one JSON report to scripts/_resources_migration_audit.json
and prints a human-readable summary to stdout.
"""
from __future__ import annotations

import json
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import yaml
# Path layout: this script lives in <vault>/scripts/, so the vault root is
# one level up.  The report file keeps a leading underscore so it matches the
# gitignored scripts/_*.json pattern.
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent
QUESTIONS_DIR = ROOT / "questions"
REPORT_PATH = _SCRIPT_DIR / "_resources_migration_audit.json"
# Hosts that serve the book itself — references to these are dropped during
# the deep_dive → resources migration.
BOOK_HOSTS = {"mlsysbook.ai", "harvard-edge.github.io"}
def host_of(url: str) -> str:
    """Return the lowercased hostname of *url*, minus any leading "www.".

    Returns "" when the URL cannot be parsed or has no hostname component.
    """
    try:
        hostname = urlparse(url).hostname
    except Exception:
        # Defensive: malformed URLs must never abort the audit walk.
        return ""
    if not hostname:
        return ""
    return hostname.lower().removeprefix("www.")
def main() -> int:
    """Audit every question YAML for a ``details.deep_dive`` reference.

    Read-only walk of QUESTIONS_DIR: tallies deep_dive coverage, URL hostname
    distribution, degenerate entries (missing title/url), and book-host
    references, then writes a JSON report to REPORT_PATH and prints a
    human-readable summary to stdout.

    Returns:
        0 on success; 2 when no question files are found (fatal for an
        audit — it means the directory-layout assumption is wrong).
    """
    # Glob both .yaml and .yml so differently-suffixed question files cannot
    # silently escape the audit; sort for a deterministic walk/report order.
    yaml_files = sorted(
        set(QUESTIONS_DIR.rglob("*.yaml")) | set(QUESTIONS_DIR.rglob("*.yml"))
    )
    total = len(yaml_files)
    if total == 0:
        print(f"FATAL: no YAML files found under {QUESTIONS_DIR}", file=sys.stderr)
        return 2

    with_deep_dive = 0
    without_deep_dive = 0
    title_missing = 0  # has url, no title (name fallback needed)
    url_missing = 0  # has title, no url (degenerate — we should log)
    hostname_counts: Counter[str] = Counter()
    book_host_count = 0
    # A question has at most one deep_dive, so a book-host ref is by
    # construction its *only* ref → zero resources post-migration.
    book_only_ids: list[str] = []
    orphans_without_title: list[dict] = []  # id + url, for fallback naming
    parse_failures: list[str] = []
    # Track-level distribution sanity: totals vs. questions carrying a ref.
    by_track: Counter[str] = Counter()
    by_track_with_ref: Counter[str] = Counter()

    for fp in yaml_files:
        rel = fp.relative_to(ROOT).as_posix()
        # Layout: questions/<track>/<level>/<zone>/<id>.yaml — first part is the track.
        parts = fp.relative_to(QUESTIONS_DIR).parts
        track = parts[0] if parts else "?"
        by_track[track] += 1
        try:
            data = yaml.safe_load(fp.read_text(encoding="utf-8"))
        except Exception as e:
            # The audit must survive any one bad file; record and move on.
            parse_failures.append(f"{rel}: {e}")
            continue
        if not isinstance(data, dict):
            parse_failures.append(f"{rel}: top-level not a dict")
            continue
        qid = data.get("id", fp.stem)
        details = data.get("details") or {}
        deep_dive = details.get("deep_dive") if isinstance(details, dict) else None
        if not deep_dive or not isinstance(deep_dive, dict):
            without_deep_dive += 1
            continue
        url = (deep_dive.get("url") or "").strip()
        title = (deep_dive.get("title") or "").strip()
        if not url and not title:
            # Field present but entirely empty — treat the same as absent.
            without_deep_dive += 1
            continue
        with_deep_dive += 1
        by_track_with_ref[track] += 1
        if not url:
            url_missing += 1
            continue
        if not title:
            title_missing += 1
            orphans_without_title.append({"id": qid, "url": url, "path": rel})
        host = host_of(url)
        hostname_counts[host] += 1
        if host in BOOK_HOSTS:
            book_host_count += 1
            # book-only = this question's single deep_dive is a book URL → migration drops it
            book_only_ids.append(qid)

    # Compose the machine-readable report (mirrored by the stdout summary).
    report = {
        # Real audit timestamp (UTC).  Previously this was a hard-coded
        # literal date, which went stale the day after it was written.
        "audited_at_iso": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "total_questions": total,
        "with_deep_dive": with_deep_dive,
        "without_deep_dive": without_deep_dive,
        "deep_dive_coverage_pct": round(100.0 * with_deep_dive / total, 1) if total else 0.0,
        "title_missing_count": title_missing,
        "url_missing_count": url_missing,
        "book_host_count": book_host_count,
        "book_host_pct_of_refs": round(100.0 * book_host_count / with_deep_dive, 1) if with_deep_dive else 0.0,
        "top_hostnames": hostname_counts.most_common(25),
        "by_track_total": dict(by_track),
        "by_track_with_ref": dict(by_track_with_ref),
        "parse_failure_count": len(parse_failures),
        "parse_failures_sample": parse_failures[:10],
        "orphans_without_title_count": title_missing,
        "orphans_without_title_sample": orphans_without_title[:15],
        "book_only_questions_will_lose_ref_sample": book_only_ids[:15],
        "book_only_questions_total_to_lose_ref": book_host_count,
    }
    REPORT_PATH.write_text(json.dumps(report, indent=2), encoding="utf-8")

    # Human summary
    print()
    print("═══════════════════════════════════════════════════════════")
    print(" StaffML Phase 0 Audit — deep_dive → resources migration")
    print("═══════════════════════════════════════════════════════════")
    print(f" Questions walked : {total:>6}")
    print(f" With deep_dive reference : {with_deep_dive:>6} ({report['deep_dive_coverage_pct']}%)")
    print(f" Without any reference : {without_deep_dive:>6}")
    print(f" Has URL but no title : {title_missing:>6} (name-fallback needed)")
    print(f" Has title but no URL : {url_missing:>6} (degenerate)")
    print()
    print(f" Book-host references : {book_host_count:>6} ({report['book_host_pct_of_refs']}% of refs)")
    # Plain string — was an f-string with no placeholders (ruff F541).
    print(" → These drop during migration")
    print(f" → Same {book_host_count} questions lose their only ref")
    print()
    print(" Track distribution:")
    for t, c in sorted(by_track.items()):
        wr = by_track_with_ref.get(t, 0)
        pct = round(100.0 * wr / c, 1) if c else 0.0
        print(f" {t:<8} total={c:>5} with_ref={wr:>5} ({pct}%)")
    print()
    print(" Top 15 hostnames in existing refs:")
    for host, count in hostname_counts.most_common(15):
        flag = " [BOOK]" if host in BOOK_HOSTS else ""
        pct = round(100.0 * count / with_deep_dive, 1) if with_deep_dive else 0.0
        print(f" {host:<40} {count:>5} ({pct:>5}%){flag}")
    print()
    if parse_failures:
        print(f" ⚠️ Parse failures: {len(parse_failures)} (first 5 shown)")
        for line in parse_failures[:5]:
            print(f" - {line}")
        print()
    print(f" Full report → {REPORT_PATH.relative_to(ROOT)}")
    print("═══════════════════════════════════════════════════════════")
    return 0
if __name__ == "__main__":
sys.exit(main())