chore(staffml): remove chapter-url manifest and link-probe infrastructure

Removes the last active coupling between StaffML questions and the
mlsysbook.ai site:

Deleted files
=============
- interviews/staffml/src/data/chapter-urls.json
  27-entry chapter-id → relative-path map. All 27 URLs currently 404
  against production because the live site serves /contents/core/...
  while the manifest uses /contents/vol1|vol2/... paths.
- interviews/staffml/scripts/check-deep-dive-links.py
  Weekly URL-health probe that walked chapter-urls.json. Nothing else
  consumed it; its sole SOURCE_PATH was the manifest above.
- .github/workflows/staffml-link-check.yml
  Scheduled CI (cron '0 9 * * 1') + PR-comment + auto-issue-filing
  pipeline for the probe. With the probe gone, the workflow had no
  job left. Grep confirmed no other workflow depends on its
  'staffml-link-report' artifact name.

Modified
========
- interviews/staffml/scripts/DEPRECATED.md
  Drop the 'check-deep-dive-links.py' row (script no longer exists
  so the replacement pointer is no longer meaningful).
- interviews/staffml/.gitignore
  Drop the '_deep_dive_link_report.json' ignore (the file that
  produced it is gone).

What replaces this
==================
Nothing yet. Per the resources-list model adopted in the preceding
commits, per-question book links are an author-curated editorial
act — authors add { name, url } entries to Details.resources when
book URLs stabilize (mlsysbook.ai/vol1 still moving). Until then,
StaffML is deliberately self-contained for book-linking purposes.

Ecosystem-level cross-linking to the book remains via Nav.tsx's
existing 'MLSysBook.ai' header link (stable, points at homepage);
a more prominent affordance is planned for a follow-up commit.
This commit is contained in:
Vijay Janapa Reddi
2026-04-16 18:27:58 -04:00
parent 6e3ef2aa6f
commit 409d58c57b
5 changed files with 0 additions and 553 deletions

View File

@@ -1,250 +0,0 @@
name: '🎯 StaffML · 🔗 Link Check'

# =============================================================================
# StaffML — Chapter-URL manifest health check
# =============================================================================
#
# Probes every unique URL in src/data/chapter-urls.json (joined with
# mlsysbook.ai) via curl and uploads a structured JSON health report. Runs on
# a weekly schedule and on manual dispatch. Optionally fails the workflow if
# any new URLs are dead.
#
# Flow:
#   1. CHECKOUT — minimal clone, no deps
#   2. PROBE    — python3 scripts/check-deep-dive-links.py
#   3. UPLOAD   — _deep_dive_link_report.json as workflow artifact
#   4. NOTIFY   — open an issue if broken-count regresses week-over-week
#
# Triggers:
#   - schedule: every Monday 09:00 UTC
#   - workflow_dispatch: manual run
#   - pull_request: only when chapter-urls.json, refs.ts, or the
#     probe itself is touched
#
# Related:
#   - interviews/staffml/scripts/check-deep-dive-links.py
#   - interviews/staffml/src/lib/refs.ts (consumer of chapter-urls.json)
#
# =============================================================================

on:
  schedule:
    # Mondays at 09:00 UTC — early enough to catch the weekend's regressions
    - cron: '0 9 * * 1'
  workflow_dispatch:
    inputs:
      fail_on_broken:
        description: 'Fail the workflow if any URL is dead'
        required: false
        default: 'false'
        type: choice
        options: ['true', 'false']
  pull_request:
    paths:
      - 'interviews/staffml/src/data/chapter-urls.json'
      - 'interviews/staffml/src/lib/refs.ts'
      - 'interviews/staffml/scripts/check-deep-dive-links.py'
      - '.github/workflows/staffml-link-check.yml'

permissions:
  contents: read
  issues: write  # for the scheduled regression-notify step (opens/updates an issue)
  pull-requests: write  # for the PR-comment step on pull_request events

concurrency:
  group: staffml-link-check
  cancel-in-progress: false  # let scheduled runs always finish

jobs:
  check-links:
    name: '🔗 Probe corpus deep_dive_urls'
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - name: 📥 Checkout
        uses: actions/checkout@v6
        with:
          # Minimal clone — we only need the corpus and the script
          fetch-depth: 1

      - name: 🐍 Verify Python
        run: python3 --version

      - name: 🔍 Verify curl
        run: curl --version

      - name: 🌐 Run link checker
        id: probe
        working-directory: interviews/staffml
        run: |
          # Probe is best-effort: broken links must not kill the step before
          # we can parse the report, so tolerate a non-zero exit here and
          # gate on the report contents below.
          set +e
          python3 scripts/check-deep-dive-links.py --quiet | tee /tmp/link-check.log
          set -e
          if [ ! -f scripts/_deep_dive_link_report.json ]; then
            echo "❌ Link checker did not produce a report"
            exit 1
          fi
          # Extract summary metrics for downstream steps
          eval "$(python3 - <<'PY'
          import json
          with open('scripts/_deep_dive_link_report.json') as f:
              r = json.load(f)
          total = r.get('total_links', 0)
          unique = r.get('unique_urls', 0)
          broken = r.get('broken_count', 0)
          healthy = unique - broken
          pct = (healthy / unique * 100) if unique else 0
          print(f"TOTAL_LINKS={total}")
          print(f"UNIQUE_URLS={unique}")
          print(f"BROKEN_COUNT={broken}")
          print(f"HEALTHY_COUNT={healthy}")
          print(f"HEALTH_PCT={pct:.1f}")
          PY
          )"
          # Publish to step outputs for downstream steps
          {
            echo "total_links=$TOTAL_LINKS"
            echo "unique_urls=$UNIQUE_URLS"
            echo "broken_count=$BROKEN_COUNT"
            echo "healthy_count=$HEALTHY_COUNT"
            echo "health_pct=$HEALTH_PCT"
          } >> "$GITHUB_OUTPUT"
          echo ""
          echo "─── Link health summary ───"
          echo "Total references : $TOTAL_LINKS"
          echo "Unique URLs : $UNIQUE_URLS"
          echo "Healthy : $HEALTHY_COUNT ($HEALTH_PCT%)"
          echo "Broken : $BROKEN_COUNT"
          # Honor the manual fail-on-broken input.
          # FIX: this previously tested `rc` captured after the tee pipeline,
          # which (a) held tee's exit status, not the probe's, and (b) was
          # always 0 anyway because the probe runs without --fail-on-broken.
          # Gate on the parsed broken count instead.
          if [ "${{ github.event.inputs.fail_on_broken }}" = "true" ] && [ "$BROKEN_COUNT" -gt 0 ]; then
            echo "❌ Manual fail-on-broken requested and broken URLs found"
            exit 1
          fi

      - name: 📤 Upload report artifact
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: staffml-link-report
          path: interviews/staffml/scripts/_deep_dive_link_report.json
          retention-days: 90

      - name: 📊 PR comment with link health diff
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const report = JSON.parse(
              fs.readFileSync('interviews/staffml/scripts/_deep_dive_link_report.json', 'utf8')
            );
            const total = report.total_links;
            const unique = report.unique_urls;
            const broken = report.broken_count;
            const healthy = unique - broken;
            const pct = unique ? ((healthy / unique) * 100).toFixed(1) : '0.0';
            // Top 5 broken-by-impact, surfaced inline
            const top = (report.broken || [])
              .slice(0, 5)
              .map(b => `| ${b.status} | ${b.occurrences} | \`${b.url.slice(0, 80)}\` |`)
              .join('\n');
            const body = [
              '## 🔗 StaffML link-health report',
              '',
              `- **Total references**: ${total}`,
              `- **Unique URLs**: ${unique}`,
              `- **Healthy (2xx/3xx)**: ${healthy} (${pct}%)`,
              `- **Broken**: ${broken}`,
              '',
              top ? '### Top 5 broken URLs by user-impact' : '',
              top ? '| Status | Occurrences | URL |' : '',
              top ? '|---|---|---|' : '',
              top,
              '',
              '> Full report uploaded as workflow artifact `staffml-link-report`.',
            ].filter(Boolean).join('\n');
            // FIX: await the API call so a failure surfaces as a step failure
            // instead of being silently dropped when the script returns.
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body,
            });

      - name: 🚨 Open issue on regression
        # Only on scheduled runs — we want a single source of weekly truth
        if: github.event_name == 'schedule'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const report = JSON.parse(
              fs.readFileSync('interviews/staffml/scripts/_deep_dive_link_report.json', 'utf8')
            );
            const broken = report.broken_count || 0;
            const unique = report.unique_urls || 0;
            const healthy = unique - broken;
            const pct = unique ? ((healthy / unique) * 100).toFixed(1) : '0.0';
            // Only file an issue if health < 60% — adjust as the corpus heals
            const HEALTH_THRESHOLD = 60.0;
            if (parseFloat(pct) >= HEALTH_THRESHOLD) {
              console.log(`Health ${pct}% >= threshold ${HEALTH_THRESHOLD}%, no issue filed.`);
              return;
            }
            const top = (report.broken || [])
              .slice(0, 10)
              .map(b => `- [${b.status}] x${b.occurrences} \`${b.url}\``)
              .join('\n');
            const title = `[StaffML] Link health ${pct}% (${broken}/${unique} broken)`;
            const body = [
              `## Weekly StaffML link-health report`,
              '',
              `- **Healthy**: ${healthy}/${unique} (${pct}%)`,
              `- **Broken**: ${broken}`,
              `- **Threshold**: ${HEALTH_THRESHOLD}%`,
              '',
              `### Top 10 broken URLs by user-impact`,
              top,
              '',
              `Full report attached as artifact \`staffml-link-report\` on the [workflow run](${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}).`,
              '',
              `_Auto-filed by \`.github/workflows/staffml-link-check.yml\`._`,
            ].join('\n');
            // Avoid duplicates: look for an open issue with the same title prefix
            const existing = await github.rest.issues.listForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              state: 'open',
              labels: 'staffml,link-health',
            });
            const dup = existing.data.find(i => i.title.startsWith('[StaffML] Link health'));
            if (dup) {
              console.log(`Updating existing issue #${dup.number}`);
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: dup.number,
                body,
              });
            } else {
              await github.rest.issues.create({
                owner: context.repo.owner,
                repo: context.repo.repo,
                title,
                body,
                labels: ['staffml', 'link-health'],
              });
            }

View File

@@ -5,9 +5,6 @@ out/
# Dependencies
node_modules/
# Link-checker report (generated by scripts/check-deep-dive-links.py)
scripts/_deep_dive_link_report.json
# Cloudflare Wrangler cache (worker dev/deploy)
worker/.wrangler/
worker/wrangler.toml.bak

View File

@@ -12,7 +12,6 @@ artifact) or pushed data into `src/data/corpus.json` (now emitted by
| `sync-vault.py` | Copied vault/corpus.json → src/data/ with filter | `vault build --legacy-json` emits site-compatible JSON directly |
| `generate-manifest.py` | Built src/data/vault-manifest.json | Built by `vault publish` as a release artifact |
| `validate-vault.py` | Sanity check on corpus shape | Covered by `vault check --strict` invariants |
| `check-deep-dive-links.py` | URL reachability | `vault check --tier slow` (nightly) |
| `format-napkin-math.py` | One-shot formatter | Obsolete |
| `sync-periodic-table.mjs` | Unrelated (periodic-table site feature) | Still active — NOT deprecated |

View File

@@ -1,270 +0,0 @@
#!/usr/bin/env python3
"""
Link checker for the StaffML → textbook chapter-URL manifest.
Walks src/data/chapter-urls.json (the 27-entry chapter-id → relative-path map
consumed by src/lib/refs.ts), prefixes each path with mlsysbook.ai, probes
each URL once via curl over a small concurrent worker pool, and emits a
structured JSON report at scripts/_deep_dive_link_report.json plus a
human-readable summary on stdout.
Background:
The per-question `deep_dive_url` field was removed during the vault
migration (Phase 1). StaffML now links to textbook chapters via this
manifest. Topic-granular linking is a separate, deferred design
(see interviews/vault/BOOK_LINKING_PLAN.md). Until that ships, the
chapter-URL manifest IS the user-facing link surface — probing it keeps
us honest about chapter-level link health.
Usage:
python3 scripts/check-deep-dive-links.py # full check
python3 scripts/check-deep-dive-links.py --hosts mlsysbook.ai
python3 scripts/check-deep-dive-links.py --fail-on-broken # exit 1 if any URL is dead
Output report shape (keys stable for the workflow to parse):
{
"checked_at": "2026-04-16T18:42:00Z",
"total_links": 27,
"unique_urls": 27,
"by_status": { "200": 27, "404": 0, ... },
"by_host": { "mlsysbook.ai": { "200": 27 }, ... },
"broken": [ { "url": "...", "status": 404, "occurrences": 1 }, ... ]
}
"""
from __future__ import annotations

import argparse
import concurrent.futures
import json
import os
import sys
import time
from collections import Counter, defaultdict
from pathlib import Path
from urllib.parse import urlparse
import subprocess
import shutil

# Hard prerequisite: URL probing shells out to the curl binary, so fail fast
# with a distinct exit code (2) if it is not on PATH rather than erroring
# later on every probe.
if shutil.which("curl") is None:
    print("FATAL: curl is required (sudo apt install curl / brew install curl)", file=sys.stderr)
    sys.exit(2)
# ───────────────────────── Config ──────────────────────────
TIMEOUT_SECONDS = 6   # per-URL budget handed to curl via --max-time
MAX_WORKERS = 8       # thread-pool width used by main()'s probe loop
USER_AGENT = "StaffML-LinkChecker/1.0 (+https://staffml.ai)"
BASE_URL = "https://mlsysbook.ai"
SOURCE_PATH = Path(__file__).resolve().parent.parent / "src" / "data" / "chapter-urls.json"
REPORT_PATH = Path(__file__).resolve().parent / "_deep_dive_link_report.json"

# Hosts we know are broken (mark in report but don't even try to probe to save time)
KNOWN_DEAD_HOSTS = {
    "harvard-edge.github.io",
}


# ───────────────────── Probing logic ───────────────────────
def probe_url(url: str) -> dict:
    """Return {"status": ..., "host": ...} for a single URL probed via curl.

    Issues a single GET (``curl -sL``, body discarded to os.devnull) and
    follows redirects; the previous docstring claimed a HEAD request, which
    the command line never performed.

    Returns:
        dict with "host" (URL hostname or "") and "status", which is either
        the final HTTP status code as an int, or one of the sentinel strings:
        "known-dead", "invalid-scheme", "timeout", "dns", "tls",
        "curl-fail-<code>", "no-status", or "error: <ExceptionName>".
    """
    parsed = urlparse(url)
    host = parsed.hostname or ""
    # Skip hosts we already know are dead — recorded in the report, not probed.
    if host in KNOWN_DEAD_HOSTS:
        return {"status": "known-dead", "host": host}
    if parsed.scheme not in ("http", "https"):
        return {"status": "invalid-scheme", "host": host}
    try:
        result = subprocess.run(
            [
                "curl",
                "-sL",  # silent + follow redirects
                "-o", os.devnull,  # discard the response body
                "-A", USER_AGENT,
                "--max-time", str(TIMEOUT_SECONDS),
                "-w", "%{http_code}",  # emit only the final status code on stdout
                url,
            ],
            capture_output=True,
            text=True,
            # Slightly larger than curl's own cap so curl gets to time out first.
            timeout=TIMEOUT_SECONDS + 2,
        )
    except subprocess.TimeoutExpired:
        return {"status": "timeout", "host": host}
    except Exception as e:
        return {"status": f"error: {type(e).__name__}", "host": host}
    if result.returncode != 0:
        # curl error code -> sentinel
        # https://curl.se/libcurl/c/libcurl-errors.html
        stderr_lower = (result.stderr or "").lower()
        if result.returncode == 6 or "could not resolve" in stderr_lower:
            return {"status": "dns", "host": host}
        if result.returncode == 28:
            return {"status": "timeout", "host": host}
        if result.returncode in (35, 60):
            return {"status": "tls", "host": host}
        return {"status": f"curl-fail-{result.returncode}", "host": host}
    code_str = (result.stdout or "").strip()
    if not code_str.isdigit():
        return {"status": "no-status", "host": host}
    return {"status": int(code_str), "host": host}
# ───────────────────── Manifest walking ────────────────────
def collect_urls(source_path: Path) -> dict[str, int]:
    """Return {url: occurrence_count} from the chapter-url manifest.

    chapter-urls.json is a flat {chapter_id: relative_path} mapping. Each
    entry is one user-facing destination, so every URL carries an occurrence
    count of 1. Relative paths are joined onto BASE_URL to form the probe
    targets. Raises SystemExit if the manifest is not a flat dict.
    """
    with source_path.open() as handle:
        manifest = json.load(handle)
    if not isinstance(manifest, dict):
        raise SystemExit(
            f"Expected a flat dict in {source_path}, got {type(manifest).__name__}"
        )
    tally: Counter[str] = Counter()
    for rel_path in manifest.values():
        # Skip non-string / empty entries; everything else is a destination.
        if isinstance(rel_path, str) and rel_path:
            # Manifest paths are site-root absolute ('/...'), so a plain
            # concatenation with BASE_URL produces the full probe URL.
            tally[BASE_URL.rstrip("/") + "/" + rel_path.lstrip("/")] += 1
    return dict(tally)
# ─────────────────────── Main flow ─────────────────────────
def main(argv: list[str]) -> int:
    """CLI entry point.

    Loads the chapter-url manifest, probes each (optionally host-filtered)
    unique URL over a thread pool, writes the JSON report to REPORT_PATH,
    and prints a human-readable summary to stdout.

    Returns a process exit code: 0 on success, 1 when --fail-on-broken is
    set and any URL is dead, 2 when the manifest file is missing.
    """
    parser = argparse.ArgumentParser(description="Check StaffML corpus deep_dive_url health.")
    parser.add_argument("--hosts", nargs="*", default=None,
                        help="Only probe URLs whose host is in this allowlist.")
    parser.add_argument("--fail-on-broken", action="store_true",
                        help="Exit with code 1 if any URL is dead (status >= 400 or sentinel).")
    parser.add_argument("--quiet", action="store_true", help="Suppress per-URL progress.")
    args = parser.parse_args(argv)

    if not SOURCE_PATH.exists():
        print(f"FATAL: chapter-url manifest not found at {SOURCE_PATH}", file=sys.stderr)
        return 2

    print(f"Loading chapter-url manifest from {SOURCE_PATH}")
    occurrences = collect_urls(SOURCE_PATH)
    total_links = sum(occurrences.values())
    unique_urls = list(occurrences.keys())

    # Optional host allowlist narrows the probe set; manifest totals above
    # are still reported in full.
    if args.hosts:
        allow = set(args.hosts)
        unique_urls = [u for u in unique_urls if (urlparse(u).hostname or "") in allow]
        print(f"Filtered by hosts {sorted(allow)}: {len(unique_urls)} URLs to probe.")

    print(f"Found {total_links} manifest entries → {len(occurrences)} unique URLs")
    if args.hosts:
        print(f"Probing {len(unique_urls)} after host filter")
    else:
        # NOTE(review): probe_url issues a GET only; "HEAD with GET fallback"
        # in this message overstates the mechanism — confirm before relying
        # on it.
        print(f"Probing {len(unique_urls)} unique URLs (HEAD with GET fallback, "
              f"timeout {TIMEOUT_SECONDS}s, {MAX_WORKERS} workers)")

    # Probe concurrently; results maps url -> {"status": ..., "host": ...}.
    started = time.time()
    results: dict[str, dict] = {}
    completed = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        future_to_url = {ex.submit(probe_url, u): u for u in unique_urls}
        for fut in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[fut]
            try:
                results[url] = fut.result()
            except Exception as e:
                # probe_url converts its own failures to sentinels; this
                # catches anything unexpected escaping the worker.
                results[url] = {"status": f"exception: {type(e).__name__}", "host": urlparse(url).hostname or ""}
            completed += 1
            if not args.quiet and completed % 25 == 0:
                print(f" ... {completed}/{len(unique_urls)} probed", file=sys.stderr)
    elapsed = time.time() - started

    # ────────── Aggregation ──────────
    by_status: Counter[str] = Counter()
    by_host: dict[str, Counter[str]] = defaultdict(Counter)
    broken = []
    SUCCESS_CODES = {200, 201, 204, 301, 302, 303, 307, 308}
    for url, info in results.items():
        status = info.get("status")
        host = info.get("host", "")
        status_str = str(status)
        by_status[status_str] += 1
        by_host[host][status_str] += 1
        # Broken = anything that isn't a 2xx/3xx success code.
        # Sentinel strings (timeout/dns/tls/known-dead/...) all count as broken.
        is_success = isinstance(status, int) and status in SUCCESS_CODES
        if not is_success:
            broken.append({
                "url": url,
                "status": status,
                "host": host,
                "occurrences": occurrences.get(url, 0),
            })
    # Highest user-impact (most-referenced) URLs first.
    broken.sort(key=lambda r: -r["occurrences"])

    # Report keys are a stable contract: the CI workflow parses total_links,
    # unique_urls, and broken_count out of this file.
    report = {
        "checked_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "elapsed_seconds": round(elapsed, 1),
        "total_links": total_links,
        "unique_urls": len(occurrences),
        "probed": len(unique_urls),
        "by_status": dict(by_status),
        "by_host": {h: dict(c) for h, c in sorted(by_host.items())},
        "broken_count": len(broken),
        "broken": broken,
    }
    REPORT_PATH.write_text(json.dumps(report, indent=2))
    print(f"\nReport written to {REPORT_PATH}")

    # ────────── Human summary ──────────
    print(f"\n=== Summary ({elapsed:.1f}s) ===")
    print(f"Manifest entries: {total_links}")
    print(f"Unique URLs: {len(occurrences)}")
    print(f"Probed: {len(unique_urls)}")
    print(f"\nBy status:")
    for s, n in sorted(by_status.items(), key=lambda kv: -kv[1]):
        print(f" {s:>14} {n}")
    print(f"\nTop 10 broken hosts (by unique URL count):")
    # NOTE(review): this display-only filter counts 201/204 as broken even
    # though SUCCESS_CODES above accepts them — confirm whether the asymmetry
    # is intentional (the JSON report itself uses SUCCESS_CODES).
    host_broken = sorted(
        [(h, sum(n for s, n in cs.items() if s not in ("200", "301", "302", "303", "307", "308"))) for h, cs in by_host.items()],
        key=lambda kv: -kv[1],
    )[:10]
    for h, n in host_broken:
        if n:
            print(f" {h:>40} {n} broken")
    print(f"\nTop 10 broken URLs (by user-impact = occurrence count):")
    for b in broken[:10]:
        print(f" [{b['status']}] x{b['occurrences']:<4} {b['url'][:90]}")

    if args.fail_on_broken and broken:
        print(f"\n{len(broken)} broken URLs — exiting 1", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

View File

@@ -1,29 +0,0 @@
{
"vol1_benchmarking": "/contents/vol1/benchmarking/benchmarking.html",
"vol1_data_engineering": "/contents/vol1/data_engineering/data_engineering.html",
"vol1_frameworks": "/contents/vol1/frameworks/frameworks.html",
"vol1_hw_acceleration": "/contents/vol1/hw_acceleration/hw_acceleration.html",
"vol1_ml_ops": "/contents/vol1/ml_ops/ml_ops.html",
"vol1_ml_systems": "/contents/vol1/ml_systems/ml_systems.html",
"vol1_ml_workflow": "/contents/vol1/ml_workflow/ml_workflow.html",
"vol1_model_serving": "/contents/vol1/model_serving/model_serving.html",
"vol1_nn_architectures": "/contents/vol1/nn_architectures/nn_architectures.html",
"vol1_nn_computation": "/contents/vol1/nn_computation/nn_computation.html",
"vol1_responsible_engr": "/contents/vol1/responsible_engr/responsible_engr.html",
"vol1_training": "/contents/vol1/training/training.html",
"vol2_collective_communication": "/contents/vol2/collective_communication/collective_communication.html",
"vol2_compute_infrastructure": "/contents/vol2/compute_infrastructure/compute_infrastructure.html",
"vol2_data_storage": "/contents/vol2/data_storage/data_storage.html",
"vol2_distributed_training": "/contents/vol2/distributed_training/distributed_training.html",
"vol2_edge_intelligence": "/contents/vol2/edge_intelligence/edge_intelligence.html",
"vol2_fault_tolerance": "/contents/vol2/fault_tolerance/fault_tolerance.html",
"vol2_fleet_orchestration": "/contents/vol2/fleet_orchestration/fleet_orchestration.html",
"vol2_inference": "/contents/vol2/inference/inference.html",
"vol2_network_fabrics": "/contents/vol2/network_fabrics/network_fabrics.html",
"vol2_ops_scale": "/contents/vol2/ops_scale/ops_scale.html",
"vol2_performance_engineering": "/contents/vol2/performance_engineering/performance_engineering.html",
"vol2_responsible_ai": "/contents/vol2/responsible_ai/responsible_ai.html",
"vol2_robust_ai": "/contents/vol2/robust_ai/robust_ai.html",
"vol2_security_privacy": "/contents/vol2/security_privacy/security_privacy.html",
"vol2_sustainable_ai": "/contents/vol2/sustainable_ai/sustainable_ai.html"
}