Files
cs249r_book/interviews/vault/scripts/audit_resources_migration.py
Vijay Janapa Reddi 04eddd2b35 chore(vault): add Phase 0 audit for deep_dive → resources migration
One-shot read-only script that walks every question YAML and reports:
- total questions, deep_dive coverage, hostname distribution
- book-host references (mlsysbook.ai, harvard-edge.github.io)
- orphans missing title (name-fallback candidates during migration)
- questions whose only ref is a book URL (would lose all refs)

Phase 0 finding from first run against 9657 question YAMLs:
- ZERO questions have the details.deep_dive field populated
- Confirms the corpus was already stripped of per-question references
  during an earlier vault migration; the refs.ts header comment about
  "4,000+ deep_dive_url values" reflects pre-migration state
- The UI conditional on current.details.deep_dive_url in practice/page.tsx
  currently renders for zero questions — it is dead code

Implication: the planned deep_dive → resources migration does not need
to touch any question YAMLs. The change reduces to (a) schema evolution,
(b) dead UI removal, (c) manifest + probe deletion. The audit script is
retained as a regression guard — if the field ever comes back it surfaces
in the next audit run.

Report output is gitignored via the scripts/_*.json pattern.
2026-04-16 18:11:59 -04:00

183 lines
7.1 KiB
Python

#!/usr/bin/env python3
"""
Phase 0 audit for the deep_dive → resources migration.
Walks every question YAML under interviews/vault/questions/, extracts the
details.deep_dive field (when present), and emits counts that determine
the migration's blast radius:
- total questions
- questions with deep_dive present / absent
- URL hostname distribution
- book-host questions (mlsysbook.ai, harvard-edge.github.io) — these get
dropped on the floor during migration per the resources-list plan
- orphan questions: have deep_dive.url but no deep_dive.title (name fallback
needed during migration)
- "book-only" questions: their single deep_dive is a book URL, so post-migration
they have zero resources — author review candidate list
Read-only. Writes one JSON report to scripts/_resources_migration_audit.json
and prints a human-readable summary to stdout.
"""
from __future__ import annotations

import json
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import yaml
# Path layout: this script lives in <vault>/scripts/, so the vault root is
# one level up.  The report file keeps a leading underscore so it matches the
# gitignored scripts/_*.json pattern.
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent
QUESTIONS_DIR = ROOT / "questions"
REPORT_PATH = _SCRIPT_DIR / "_resources_migration_audit.json"
# Hosts that serve the book itself — references to these are dropped during
# the deep_dive → resources migration.
BOOK_HOSTS = {"mlsysbook.ai", "harvard-edge.github.io"}
def host_of(url: str) -> str:
    """Return the lowercased hostname of *url*, minus any leading "www.".

    Returns "" when the URL cannot be parsed or has no hostname component.
    """
    try:
        hostname = urlparse(url).hostname
    except Exception:
        # Defensive: malformed URLs must never abort the audit walk.
        return ""
    if not hostname:
        return ""
    return hostname.lower().removeprefix("www.")
def main() -> int:
    """Audit every question YAML for a ``details.deep_dive`` reference.

    Read-only walk of QUESTIONS_DIR: tallies deep_dive coverage, URL hostname
    distribution, degenerate entries (missing title/url), and book-host
    references, then writes a JSON report to REPORT_PATH and prints a
    human-readable summary to stdout.

    Returns:
        0 on success; 2 when no question files are found (fatal for an
        audit — it means the directory-layout assumption is wrong).
    """
    # Glob both .yaml and .yml so differently-suffixed question files cannot
    # silently escape the audit; sort for a deterministic walk/report order.
    yaml_files = sorted(
        set(QUESTIONS_DIR.rglob("*.yaml")) | set(QUESTIONS_DIR.rglob("*.yml"))
    )
    total = len(yaml_files)
    if total == 0:
        print(f"FATAL: no YAML files found under {QUESTIONS_DIR}", file=sys.stderr)
        return 2

    with_deep_dive = 0
    without_deep_dive = 0
    title_missing = 0  # has url, no title (name fallback needed)
    url_missing = 0  # has title, no url (degenerate — we should log)
    hostname_counts: Counter[str] = Counter()
    book_host_count = 0
    # A question has at most one deep_dive, so a book-host ref is by
    # construction its *only* ref → zero resources post-migration.
    book_only_ids: list[str] = []
    orphans_without_title: list[dict] = []  # id + url, for fallback naming
    parse_failures: list[str] = []
    # Track-level distribution sanity: totals vs. questions carrying a ref.
    by_track: Counter[str] = Counter()
    by_track_with_ref: Counter[str] = Counter()

    for fp in yaml_files:
        rel = fp.relative_to(ROOT).as_posix()
        # Layout: questions/<track>/<level>/<zone>/<id>.yaml — first part is the track.
        parts = fp.relative_to(QUESTIONS_DIR).parts
        track = parts[0] if parts else "?"
        by_track[track] += 1
        try:
            data = yaml.safe_load(fp.read_text(encoding="utf-8"))
        except Exception as e:
            # The audit must survive any one bad file; record and move on.
            parse_failures.append(f"{rel}: {e}")
            continue
        if not isinstance(data, dict):
            parse_failures.append(f"{rel}: top-level not a dict")
            continue
        qid = data.get("id", fp.stem)
        details = data.get("details") or {}
        deep_dive = details.get("deep_dive") if isinstance(details, dict) else None
        if not deep_dive or not isinstance(deep_dive, dict):
            without_deep_dive += 1
            continue
        url = (deep_dive.get("url") or "").strip()
        title = (deep_dive.get("title") or "").strip()
        if not url and not title:
            # Field present but entirely empty — treat the same as absent.
            without_deep_dive += 1
            continue
        with_deep_dive += 1
        by_track_with_ref[track] += 1
        if not url:
            url_missing += 1
            continue
        if not title:
            title_missing += 1
            orphans_without_title.append({"id": qid, "url": url, "path": rel})
        host = host_of(url)
        hostname_counts[host] += 1
        if host in BOOK_HOSTS:
            book_host_count += 1
            # book-only = this question's single deep_dive is a book URL → migration drops it
            book_only_ids.append(qid)

    # Compose the machine-readable report (mirrored by the stdout summary).
    report = {
        # Real audit timestamp (UTC).  Previously this was a hard-coded
        # literal date, which went stale the day after it was written.
        "audited_at_iso": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "total_questions": total,
        "with_deep_dive": with_deep_dive,
        "without_deep_dive": without_deep_dive,
        "deep_dive_coverage_pct": round(100.0 * with_deep_dive / total, 1) if total else 0.0,
        "title_missing_count": title_missing,
        "url_missing_count": url_missing,
        "book_host_count": book_host_count,
        "book_host_pct_of_refs": round(100.0 * book_host_count / with_deep_dive, 1) if with_deep_dive else 0.0,
        "top_hostnames": hostname_counts.most_common(25),
        "by_track_total": dict(by_track),
        "by_track_with_ref": dict(by_track_with_ref),
        "parse_failure_count": len(parse_failures),
        "parse_failures_sample": parse_failures[:10],
        "orphans_without_title_count": title_missing,
        "orphans_without_title_sample": orphans_without_title[:15],
        "book_only_questions_will_lose_ref_sample": book_only_ids[:15],
        "book_only_questions_total_to_lose_ref": book_host_count,
    }
    REPORT_PATH.write_text(json.dumps(report, indent=2), encoding="utf-8")

    # Human summary
    print()
    print("═══════════════════════════════════════════════════════════")
    print(" StaffML Phase 0 Audit — deep_dive → resources migration")
    print("═══════════════════════════════════════════════════════════")
    print(f" Questions walked : {total:>6}")
    print(f" With deep_dive reference : {with_deep_dive:>6} ({report['deep_dive_coverage_pct']}%)")
    print(f" Without any reference : {without_deep_dive:>6}")
    print(f" Has URL but no title : {title_missing:>6} (name-fallback needed)")
    print(f" Has title but no URL : {url_missing:>6} (degenerate)")
    print()
    print(f" Book-host references : {book_host_count:>6} ({report['book_host_pct_of_refs']}% of refs)")
    # Plain string — was an f-string with no placeholders (ruff F541).
    print(" → These drop during migration")
    print(f" → Same {book_host_count} questions lose their only ref")
    print()
    print(" Track distribution:")
    for t, c in sorted(by_track.items()):
        wr = by_track_with_ref.get(t, 0)
        pct = round(100.0 * wr / c, 1) if c else 0.0
        print(f" {t:<8} total={c:>5} with_ref={wr:>5} ({pct}%)")
    print()
    print(" Top 15 hostnames in existing refs:")
    for host, count in hostname_counts.most_common(15):
        flag = " [BOOK]" if host in BOOK_HOSTS else ""
        pct = round(100.0 * count / with_deep_dive, 1) if with_deep_dive else 0.0
        print(f" {host:<40} {count:>5} ({pct:>5}%){flag}")
    print()
    if parse_failures:
        print(f" ⚠️ Parse failures: {len(parse_failures)} (first 5 shown)")
        for line in parse_failures[:5]:
            print(f" - {line}")
        print()
    print(f" Full report → {REPORT_PATH.relative_to(ROOT)}")
    print("═══════════════════════════════════════════════════════════")
    return 0
if __name__ == "__main__":
sys.exit(main())