mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-07 02:03:55 -05:00
One-shot read-only script that walks every question YAML and reports: - total questions, deep_dive coverage, hostname distribution - book-host references (mlsysbook.ai, harvard-edge.github.io) - orphans missing title (name-fallback candidates during migration) - questions whose only ref is a book URL (would lose all refs) Phase 0 finding from first run against 9657 question YAMLs: - ZERO questions have the details.deep_dive field populated - Confirms the corpus was already stripped of per-question references during an earlier vault migration; the refs.ts header comment about "4,000+ deep_dive_url values" reflects pre-migration state - The UI conditional on current.details.deep_dive_url in practice/page.tsx currently renders for zero questions — it is dead code Implication: the planned deep_dive → resources migration does not need to touch any question YAMLs. The change reduces to (a) schema evolution, (b) dead UI removal, (c) manifest + probe deletion. The audit script is retained as a regression guard — if the field ever comes back it surfaces in the next audit run. Report output is gitignored via scripts/_*.json pattern.
183 lines
7.1 KiB
Python
#!/usr/bin/env python3
"""
Phase 0 audit for the deep_dive → resources migration.

Walks every question YAML under interviews/vault/questions/, extracts the
details.deep_dive field (when present), and emits counts that determine
the migration's blast radius:

- total questions
- questions with deep_dive present / absent
- URL hostname distribution
- book-host questions (mlsysbook.ai, harvard-edge.github.io) — these get
  dropped on the floor during migration per the resources-list plan
- orphan questions: have deep_dive.url but no deep_dive.title (name fallback
  needed during migration)
- "book-only" questions: their single deep_dive is a book URL, so post-migration
  they have zero resources — author review candidate list

Read-only. Writes one JSON report to scripts/_resources_migration_audit.json
and prints a human-readable summary to stdout.
"""
|
|
|
|
from __future__ import annotations

import json
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

import yaml
# Repo layout: this script lives in <root>/scripts/, question YAMLs in <root>/questions/.
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent
QUESTIONS_DIR = ROOT / "questions"
# Report lands next to the script; the scripts/_*.json pattern is gitignored.
REPORT_PATH = _SCRIPT_DIR / "_resources_migration_audit.json"

# Hosts that point into the book itself — these refs are dropped by the migration.
BOOK_HOSTS = {"mlsysbook.ai", "harvard-edge.github.io"}
|
|
|
|
|
|
def host_of(url: str) -> str:
    """Return the lowercase hostname of *url*, with any leading "www." stripped.

    Falls back to "" when the URL cannot be parsed or carries no hostname,
    so callers can treat the result as a plain (possibly empty) string.
    """
    try:
        hostname = urlparse(url).hostname
    except Exception:
        # Malformed input (e.g. bad port syntax) — classify as "no host".
        return ""
    if not hostname:
        return ""
    return hostname.lower().removeprefix("www.")
|
|
|
|
|
|
def main() -> int:
    """Audit deep_dive coverage across all question YAMLs and write the JSON report.

    Walks QUESTIONS_DIR recursively, classifies every question's
    details.deep_dive field, writes a machine-readable report to REPORT_PATH,
    and prints a human-readable summary to stdout.

    Returns:
        Process exit code — 0 on success, 2 when no question files are found.
    """
    # Fix: include both common YAML extensions so stray *.yml files are not
    # silently excluded from the audit (original only matched *.yaml).
    yaml_files = sorted([*QUESTIONS_DIR.rglob("*.yaml"), *QUESTIONS_DIR.rglob("*.yml")])
    total = len(yaml_files)
    if total == 0:
        print(f"FATAL: no YAML files found under {QUESTIONS_DIR}", file=sys.stderr)
        return 2

    with_deep_dive = 0
    without_deep_dive = 0
    title_missing = 0  # has url, no title (name fallback needed)
    url_missing = 0    # has title, no url (degenerate — we should log)
    hostname_counts: Counter[str] = Counter()
    book_host_count = 0
    book_only_ids: list[str] = []  # single deep_dive is a book URL -> zero refs post-migration
    orphans_without_title: list[dict] = []  # id + url, for fallback naming
    parse_failures: list[str] = []

    # Track by top-level directory (track) for distribution sanity.
    by_track: Counter[str] = Counter()
    by_track_with_ref: Counter[str] = Counter()

    for fp in yaml_files:
        rel = fp.relative_to(ROOT).as_posix()
        # Expected layout: questions/<track>/<level>/<zone>/<id>.yaml
        parts = fp.relative_to(QUESTIONS_DIR).parts
        track = parts[0] if parts else "?"
        by_track[track] += 1

        try:
            data = yaml.safe_load(fp.read_text(encoding="utf-8"))
        except Exception as e:
            # Read-only audit: record the failure and keep walking.
            parse_failures.append(f"{rel}: {e}")
            continue

        if not isinstance(data, dict):
            parse_failures.append(f"{rel}: top-level not a dict")
            continue

        qid = data.get("id", fp.stem)
        details = data.get("details") or {}
        deep_dive = details.get("deep_dive") if isinstance(details, dict) else None

        if not deep_dive or not isinstance(deep_dive, dict):
            without_deep_dive += 1
            continue

        url = (deep_dive.get("url") or "").strip()
        title = (deep_dive.get("title") or "").strip()

        if not url and not title:
            # Field present but empty on both axes — treat as absent.
            without_deep_dive += 1
            continue

        with_deep_dive += 1
        by_track_with_ref[track] += 1

        if not url:
            url_missing += 1
            continue  # no URL means no hostname to classify

        if not title:
            title_missing += 1
            orphans_without_title.append({"id": qid, "url": url, "path": rel})

        host = host_of(url)
        hostname_counts[host] += 1

        if host in BOOK_HOSTS:
            book_host_count += 1
            # book-only = this question's single deep_dive is a book URL → migration drops it
            book_only_ids.append(qid)

    # Compose report. Fix: the audit timestamp is the actual run date (UTC)
    # rather than a hardcoded literal — the script is a regression guard, so
    # successive runs must be distinguishable in the report.
    report = {
        "audited_at_iso": datetime.now(timezone.utc).date().isoformat(),
        "total_questions": total,
        "with_deep_dive": with_deep_dive,
        "without_deep_dive": without_deep_dive,
        "deep_dive_coverage_pct": round(100.0 * with_deep_dive / total, 1) if total else 0.0,
        "title_missing_count": title_missing,
        "url_missing_count": url_missing,
        "book_host_count": book_host_count,
        "book_host_pct_of_refs": round(100.0 * book_host_count / with_deep_dive, 1) if with_deep_dive else 0.0,
        "top_hostnames": hostname_counts.most_common(25),
        "by_track_total": dict(by_track),
        "by_track_with_ref": dict(by_track_with_ref),
        "parse_failure_count": len(parse_failures),
        "parse_failures_sample": parse_failures[:10],
        "orphans_without_title_count": title_missing,
        "orphans_without_title_sample": orphans_without_title[:15],
        "book_only_questions_will_lose_ref_sample": book_only_ids[:15],
        "book_only_questions_total_to_lose_ref": book_host_count,
    }

    REPORT_PATH.write_text(json.dumps(report, indent=2), encoding="utf-8")

    # Human summary
    print()
    print("═══════════════════════════════════════════════════════════")
    print(" StaffML Phase 0 Audit — deep_dive → resources migration")
    print("═══════════════════════════════════════════════════════════")
    print(f" Questions walked : {total:>6}")
    print(f" With deep_dive reference : {with_deep_dive:>6} ({report['deep_dive_coverage_pct']}%)")
    print(f" Without any reference : {without_deep_dive:>6}")
    print(f" Has URL but no title : {title_missing:>6} (name-fallback needed)")
    print(f" Has title but no URL : {url_missing:>6} (degenerate)")
    print()
    print(f" Book-host references : {book_host_count:>6} ({report['book_host_pct_of_refs']}% of refs)")
    print(" → These drop during migration")
    print(f" → Same {book_host_count} questions lose their only ref")
    print()
    print(" Track distribution:")
    for t, c in sorted(by_track.items()):
        wr = by_track_with_ref.get(t, 0)
        pct = round(100.0 * wr / c, 1) if c else 0.0
        print(f" {t:<8} total={c:>5} with_ref={wr:>5} ({pct}%)")
    print()
    print(" Top 15 hostnames in existing refs:")
    for host, count in hostname_counts.most_common(15):
        flag = " [BOOK]" if host in BOOK_HOSTS else ""
        pct = round(100.0 * count / with_deep_dive, 1) if with_deep_dive else 0.0
        print(f" {host:<40} {count:>5} ({pct:>5}%){flag}")
    print()
    if parse_failures:
        print(f" ⚠️ Parse failures: {len(parse_failures)} (first 5 shown)")
        for line in parse_failures[:5]:
            print(f" - {line}")
        print()
    print(f" Full report → {REPORT_PATH.relative_to(ROOT)}")
    print("═══════════════════════════════════════════════════════════")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":  # script entry point — propagate main()'s exit code
    raise SystemExit(main())
|