mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 09:38:33 -05:00
chore(staffml): remove chapter-url manifest and link-probe infrastructure
Removes the last active coupling between StaffML questions and the
mlsysbook.ai site:
Deleted files
=============
- interviews/staffml/src/data/chapter-urls.json
27-entry chapter-id → relative-path map. All 27 URLs currently 404
against production because the live site serves /contents/core/...
while the manifest uses /contents/vol1|vol2/... paths.
- interviews/staffml/scripts/check-deep-dive-links.py
Weekly URL-health probe that walked chapter-urls.json. Nothing else
consumes it; its sole SOURCE_PATH was the manifest above.
- .github/workflows/staffml-link-check.yml
Scheduled CI (cron '0 9 * * 1') + PR-comment + auto-issue-filing
pipeline for the probe. With the probe gone, the workflow had no
job left. Grep confirmed no other workflow depends on its
'staffml-link-report' artifact name.
Modified
========
- interviews/staffml/scripts/DEPRECATED.md
Drop the 'check-deep-dive-links.py' row (script no longer exists
so the replacement pointer is no longer meaningful).
- interviews/staffml/.gitignore
Drop the '_deep_dive_link_report.json' ignore (the file that
produced it is gone).
What replaces this
==================
Nothing yet. Per the resources-list model adopted in the preceding
commits, per-question book links are an author-curated editorial
act — authors add { name, url } entries to Details.resources when
book URLs stabilize (mlsysbook.ai/vol1 still moving). Until then,
StaffML is deliberately self-contained for book-linking purposes.
Ecosystem-level cross-linking to the book remains via Nav.tsx's
existing 'MLSysBook.ai' header link (stable, points at homepage);
a more prominent affordance is planned for a follow-up commit.
This commit is contained in:
250
.github/workflows/staffml-link-check.yml
vendored
250
.github/workflows/staffml-link-check.yml
vendored
@@ -1,250 +0,0 @@
|
||||
name: '🎯 StaffML · 🔗 Link Check'

# =============================================================================
# StaffML — Chapter-URL manifest health check
# =============================================================================
#
# Probes every unique URL in src/data/chapter-urls.json (joined with
# mlsysbook.ai) via curl and uploads a structured JSON health report. Runs on
# a weekly schedule, on manual dispatch, and on pull requests that touch the
# manifest or the probe. A manual run can optionally fail the workflow if any
# URL is dead.
#
# Flow:
#   1. CHECKOUT — minimal clone, no deps
#   2. PROBE    — python3 scripts/check-deep-dive-links.py
#   3. UPLOAD   — _deep_dive_link_report.json as workflow artifact
#   4. COMMENT  — on pull requests, post the health summary as a PR comment
#   5. NOTIFY   — on scheduled runs, open (or comment on) an issue when link
#                 health is below the threshold defined in that step
#
# Triggers:
#   - schedule:          every Monday 09:00 UTC
#   - workflow_dispatch: manual run
#   - pull_request:      only when chapter-urls.json, refs.ts, or the
#                        probe itself is touched
#
# Related:
#   - interviews/staffml/scripts/check-deep-dive-links.py
#   - interviews/staffml/src/lib/refs.ts (consumer of chapter-urls.json)
#
# =============================================================================

on:
  schedule:
    # Mondays at 09:00 UTC — early enough to catch the weekend's regressions
    - cron: '0 9 * * 1'
  workflow_dispatch:
    inputs:
      fail_on_broken:
        description: 'Fail the workflow if any URL is dead'
        required: false
        default: 'false'
        type: choice
        options: ['true', 'false']
  pull_request:
    paths:
      - 'interviews/staffml/src/data/chapter-urls.json'
      - 'interviews/staffml/src/lib/refs.ts'
      - 'interviews/staffml/scripts/check-deep-dive-links.py'
      - '.github/workflows/staffml-link-check.yml'

permissions:
  contents: read
  issues: write         # for the scheduled regression-notify step (opens/updates an issue)
  pull-requests: write  # for the PR-comment step on pull_request events

concurrency:
  group: staffml-link-check
  cancel-in-progress: false  # let scheduled runs always finish

jobs:
  check-links:
    name: '🔗 Probe corpus deep_dive_urls'
    runs-on: ubuntu-latest
    timeout-minutes: 15

    steps:
      - name: 📥 Checkout
        uses: actions/checkout@v6
        with:
          # Minimal clone — we only need the corpus and the script
          fetch-depth: 1

      - name: 🐍 Verify Python
        run: python3 --version

      - name: 🔍 Verify curl
        run: curl --version

      - name: 🌐 Run link checker
        id: probe
        working-directory: interviews/staffml
        env:
          # Pass the workflow input through the environment instead of
          # interpolating it into the script text. Only workflow_dispatch
          # defines this input.
          FAIL_ON_BROKEN: ${{ github.event.inputs.fail_on_broken }}
        run: |
          set +e
          python3 scripts/check-deep-dive-links.py --quiet | tee /tmp/link-check.log
          # $? after a pipeline is tee's status; PIPESTATUS[0] is the probe's.
          rc=${PIPESTATUS[0]}
          set -e

          if [ ! -f scripts/_deep_dive_link_report.json ]; then
            echo "❌ Link checker did not produce a report"
            echo "   probe exit code: $rc"
            exit 1
          fi

          # Extract summary metrics for downstream steps
          eval "$(python3 - <<'PY'
          import json
          with open('scripts/_deep_dive_link_report.json') as f:
              r = json.load(f)
          total = r.get('total_links', 0)
          unique = r.get('unique_urls', 0)
          broken = r.get('broken_count', 0)
          healthy = unique - broken
          pct = (healthy / unique * 100) if unique else 0
          print(f"TOTAL_LINKS={total}")
          print(f"UNIQUE_URLS={unique}")
          print(f"BROKEN_COUNT={broken}")
          print(f"HEALTHY_COUNT={healthy}")
          print(f"HEALTH_PCT={pct:.1f}")
          PY
          )"

          # Publish to step outputs for downstream steps
          {
            echo "total_links=$TOTAL_LINKS"
            echo "unique_urls=$UNIQUE_URLS"
            echo "broken_count=$BROKEN_COUNT"
            echo "healthy_count=$HEALTHY_COUNT"
            echo "health_pct=$HEALTH_PCT"
          } >> "$GITHUB_OUTPUT"

          echo ""
          echo "─── Link health summary ───"
          echo "Total references : $TOTAL_LINKS"
          echo "Unique URLs : $UNIQUE_URLS"
          echo "Healthy : $HEALTHY_COUNT ($HEALTH_PCT%)"
          echo "Broken : $BROKEN_COUNT"

          # Honor the manual fail-on-broken input. The probe is run without
          # --fail-on-broken, so its exit code does not reflect dead URLs;
          # gate on the broken count taken from the report instead.
          if [ "$FAIL_ON_BROKEN" = "true" ] && [ "$BROKEN_COUNT" -gt 0 ]; then
            echo "❌ Manual fail-on-broken requested and broken URLs found"
            exit 1
          fi

      - name: 📤 Upload report artifact
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: staffml-link-report
          path: interviews/staffml/scripts/_deep_dive_link_report.json
          retention-days: 90

      - name: 📊 PR comment with link health diff
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const report = JSON.parse(
              fs.readFileSync('interviews/staffml/scripts/_deep_dive_link_report.json', 'utf8')
            );
            const total = report.total_links;
            const unique = report.unique_urls;
            const broken = report.broken_count;
            const healthy = unique - broken;
            const pct = unique ? ((healthy / unique) * 100).toFixed(1) : '0.0';

            // Top 5 broken-by-impact, surfaced inline
            const top = (report.broken || [])
              .slice(0, 5)
              .map(b => `| ${b.status} | ${b.occurrences} | \`${b.url.slice(0, 80)}\` |`)
              .join('\n');

            const body = [
              '## 🔗 StaffML link-health report',
              '',
              `- **Total references**: ${total}`,
              `- **Unique URLs**: ${unique}`,
              `- **Healthy (2xx/3xx)**: ${healthy} (${pct}%)`,
              `- **Broken**: ${broken}`,
              '',
              top ? '### Top 5 broken URLs by user-impact' : '',
              top ? '| Status | Occurrences | URL |' : '',
              top ? '|---|---|---|' : '',
              top,
              '',
              '> Full report uploaded as workflow artifact `staffml-link-report`.',
            ].filter(Boolean).join('\n');

            // Await the request so a failed API call fails this step
            // instead of surfacing as an unhandled rejection.
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body,
            });

      - name: 🚨 Open issue on regression
        # Only on scheduled runs — we want a single source of weekly truth
        if: github.event_name == 'schedule'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const report = JSON.parse(
              fs.readFileSync('interviews/staffml/scripts/_deep_dive_link_report.json', 'utf8')
            );
            const broken = report.broken_count || 0;
            const unique = report.unique_urls || 0;
            const healthy = unique - broken;
            const pct = unique ? ((healthy / unique) * 100).toFixed(1) : '0.0';

            // Only file an issue if health < 60% — adjust as the corpus heals
            const HEALTH_THRESHOLD = 60.0;
            if (parseFloat(pct) >= HEALTH_THRESHOLD) {
              console.log(`Health ${pct}% >= threshold ${HEALTH_THRESHOLD}%, no issue filed.`);
              return;
            }

            const top = (report.broken || [])
              .slice(0, 10)
              .map(b => `- [${b.status}] x${b.occurrences} \`${b.url}\``)
              .join('\n');

            const title = `[StaffML] Link health ${pct}% (${broken}/${unique} broken)`;
            const body = [
              `## Weekly StaffML link-health report`,
              '',
              `- **Healthy**: ${healthy}/${unique} (${pct}%)`,
              `- **Broken**: ${broken}`,
              `- **Threshold**: ${HEALTH_THRESHOLD}%`,
              '',
              `### Top 10 broken URLs by user-impact`,
              top,
              '',
              `Full report attached as artifact \`staffml-link-report\` on the [workflow run](${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}).`,
              '',
              `_Auto-filed by \`.github/workflows/staffml-link-check.yml\`._`,
            ].join('\n');

            // Avoid duplicates: look for an open issue with the same title prefix
            const existing = await github.rest.issues.listForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              state: 'open',
              labels: 'staffml,link-health',
            });
            const dup = existing.data.find(i => i.title.startsWith('[StaffML] Link health'));
            if (dup) {
              console.log(`Updating existing issue #${dup.number}`);
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: dup.number,
                body,
              });
            } else {
              await github.rest.issues.create({
                owner: context.repo.owner,
                repo: context.repo.repo,
                title,
                body,
                labels: ['staffml', 'link-health'],
              });
            }
|
||||
3
interviews/staffml/.gitignore
vendored
3
interviews/staffml/.gitignore
vendored
@@ -5,9 +5,6 @@ out/
|
||||
# Dependencies
|
||||
node_modules/
|
||||
|
||||
# Link-checker report (generated by scripts/check-deep-dive-links.py)
|
||||
scripts/_deep_dive_link_report.json
|
||||
|
||||
# Cloudflare Wrangler cache (worker dev/deploy)
|
||||
worker/.wrangler/
|
||||
worker/wrangler.toml.bak
|
||||
|
||||
@@ -12,7 +12,6 @@ artifact) or pushed data into `src/data/corpus.json` (now emitted by
|
||||
| `sync-vault.py` | Copied vault/corpus.json → src/data/ with filter | `vault build --legacy-json` emits site-compatible JSON directly |
|
||||
| `generate-manifest.py` | Built src/data/vault-manifest.json | Built by `vault publish` as a release artifact |
|
||||
| `validate-vault.py` | Sanity check on corpus shape | Covered by `vault check --strict` invariants |
|
||||
| `check-deep-dive-links.py` | URL reachability | `vault check --tier slow` (nightly) |
|
||||
| `format-napkin-math.py` | One-shot formatter | Obsolete |
|
||||
| `sync-periodic-table.mjs` | Unrelated (periodic-table site feature) | Still active — NOT deprecated |
|
||||
|
||||
|
||||
@@ -1,270 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Link checker for the StaffML → textbook chapter-URL manifest.
|
||||
|
||||
Walks src/data/chapter-urls.json (the 27-entry chapter-id → relative-path map
|
||||
consumed by src/lib/refs.ts), prefixes each path with mlsysbook.ai, probes
|
||||
each URL once via curl over a small concurrent worker pool, and emits a
|
||||
structured JSON report at scripts/_deep_dive_link_report.json plus a
|
||||
human-readable summary on stdout.
|
||||
|
||||
Background:
|
||||
The per-question `deep_dive_url` field was removed during the vault
|
||||
migration (Phase 1). StaffML now links to textbook chapters via this
|
||||
manifest. Topic-granular linking is a separate, deferred design
|
||||
(see interviews/vault/BOOK_LINKING_PLAN.md). Until that ships, the
|
||||
chapter-URL manifest IS the user-facing link surface — probing it keeps
|
||||
us honest about chapter-level link health.
|
||||
|
||||
Usage:
|
||||
python3 scripts/check-deep-dive-links.py # full check
|
||||
python3 scripts/check-deep-dive-links.py --hosts mlsysbook.ai
|
||||
python3 scripts/check-deep-dive-links.py --fail-on-broken # exit 1 if any URL is dead
|
||||
|
||||
Output report shape (keys stable for the workflow to parse):
|
||||
{
|
||||
"checked_at": "2026-04-16T18:42:00Z",
|
||||
"total_links": 27,
|
||||
"unique_urls": 27,
|
||||
"by_status": { "200": 27, "404": 0, ... },
|
||||
"by_host": { "mlsysbook.ai": { "200": 27 }, ... },
|
||||
"broken": [ { "url": "...", "status": 404, "occurrences": 1 }, ... ]
|
||||
}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from collections import Counter, defaultdict
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import subprocess
|
||||
import shutil
|
||||
|
||||
# The probe shells out to curl, so refuse to start when it is not on PATH.
if not shutil.which("curl"):
    print("FATAL: curl is required (sudo apt install curl / brew install curl)", file=sys.stderr)
    sys.exit(2)
|
||||
|
||||
# ───────────────────────── Config ──────────────────────────
# Per-URL budget in seconds, handed to curl via --max-time.
TIMEOUT_SECONDS = 6
# Size of the thread pool that runs the probes.
MAX_WORKERS = 8
# User-Agent header sent with every probe.
USER_AGENT = "StaffML-LinkChecker/1.0 (+https://staffml.ai)"
# Site root that each manifest path is joined onto.
BASE_URL = "https://mlsysbook.ai"

# Directory containing this script; both paths below are derived from it.
_HERE = Path(__file__).resolve().parent
# Input manifest: <script dir>/../src/data/chapter-urls.json
SOURCE_PATH = _HERE.parent / "src" / "data" / "chapter-urls.json"
# Output report, written next to this script.
REPORT_PATH = _HERE / "_deep_dive_link_report.json"

# Hosts we know are broken (mark in report but don't even try to probe to save time)
KNOWN_DEAD_HOSTS = {"harvard-edge.github.io"}
|
||||
|
||||
|
||||
# ───────────────────── Probing logic ───────────────────────
|
||||
def probe_url(url: str) -> dict:
    """Probe one URL with curl and classify the outcome.

    Runs a silent curl request that follows redirects (``-sL``), discards the
    response body, and reads the final HTTP status code from
    ``-w %{http_code}``.

    Args:
        url: The URL to probe.

    Returns:
        ``{"status": ..., "host": ...}``. ``host`` is the hostname parsed from
        ``url`` ("" when absent). ``status`` is the final HTTP status code as
        an int, or one of the sentinel strings ``known-dead``,
        ``invalid-scheme``, ``timeout``, ``dns``, ``tls``,
        ``curl-fail-<exit code>``, ``no-status`` or
        ``error: <exception class name>``.
    """
    target = urlparse(url)
    hostname = target.hostname or ""

    def outcome(status) -> dict:
        # Every exit path reports the same host alongside its status.
        return {"status": status, "host": hostname}

    # Cheap rejections first: no subprocess is spawned for these.
    if hostname in KNOWN_DEAD_HOSTS:
        return outcome("known-dead")
    if target.scheme not in ("http", "https"):
        return outcome("invalid-scheme")

    command = [
        "curl",
        "-sL",  # silent + follow redirects
        "-o", os.devnull,
        "-A", USER_AGENT,
        "--max-time", str(TIMEOUT_SECONDS),
        "-w", "%{http_code}",
        url,
    ]
    try:
        # The subprocess timeout is slightly longer than curl's own
        # --max-time, so curl normally gets to report the timeout itself.
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=TIMEOUT_SECONDS + 2,
        )
    except subprocess.TimeoutExpired:
        return outcome("timeout")
    except Exception as exc:
        return outcome(f"error: {type(exc).__name__}")

    exit_code = proc.returncode
    if exit_code != 0:
        # curl error code -> sentinel
        # https://curl.se/libcurl/c/libcurl-errors.html
        if exit_code == 6 or "could not resolve" in (proc.stderr or "").lower():
            return outcome("dns")
        named_failures = {28: "timeout", 35: "tls", 60: "tls"}
        return outcome(named_failures.get(exit_code, f"curl-fail-{exit_code}"))

    http_code = (proc.stdout or "").strip()
    return outcome(int(http_code) if http_code.isdigit() else "no-status")
|
||||
|
||||
|
||||
# ───────────────────── Manifest walking ────────────────────
|
||||
def collect_urls(source_path: Path) -> dict[str, int]:
    """Load the chapter-url manifest and count how often each full URL occurs.

    The manifest is expected to be a flat JSON object mapping a chapter id to
    a site-relative path. Each usable path is joined onto ``BASE_URL`` with
    exactly one ``/`` between them.

    Args:
        source_path: Location of the JSON manifest.

    Returns:
        ``{url: occurrence_count}`` in manifest order. Entries whose value is
        not a non-empty string are skipped.

    Raises:
        SystemExit: If the top-level JSON value is not an object.
    """
    with source_path.open() as handle:
        manifest = json.load(handle)

    if not isinstance(manifest, dict):
        raise SystemExit(
            f"Expected a flat dict in {source_path}, got {type(manifest).__name__}"
        )

    # Strip the slashes on both sides of the join so neither a trailing '/'
    # on BASE_URL nor a leading '/' on the path produces '//'.
    site_root = BASE_URL.rstrip("/")
    tally: Counter[str] = Counter(
        f"{site_root}/{path.lstrip('/')}"
        for path in manifest.values()
        if isinstance(path, str) and path
    )
    return dict(tally)
|
||||
|
||||
|
||||
# ─────────────────────── Main flow ─────────────────────────
|
||||
def main(argv: list[str]) -> int:
    """Probe every manifest URL, write the JSON report, and print a summary.

    Args:
        argv: Command-line arguments without the program name. Recognised
            flags are ``--hosts`` (hostname allowlist), ``--fail-on-broken``
            and ``--quiet``.

    Returns:
        Process exit code: 2 when the manifest file does not exist, 1 when
        ``--fail-on-broken`` was given and at least one probed URL is broken,
        otherwise 0.
    """
    parser = argparse.ArgumentParser(description="Check StaffML corpus deep_dive_url health.")
    parser.add_argument("--hosts", nargs="*", default=None,
                        help="Only probe URLs whose host is in this allowlist.")
    parser.add_argument("--fail-on-broken", action="store_true",
                        help="Exit with code 1 if any URL is dead (status >= 400 or sentinel).")
    parser.add_argument("--quiet", action="store_true", help="Suppress per-URL progress.")
    args = parser.parse_args(argv)

    if not SOURCE_PATH.exists():
        print(f"FATAL: chapter-url manifest not found at {SOURCE_PATH}", file=sys.stderr)
        return 2

    print(f"Loading chapter-url manifest from {SOURCE_PATH}")
    occurrences = collect_urls(SOURCE_PATH)
    total_links = sum(occurrences.values())
    unique_urls = list(occurrences.keys())

    if args.hosts:
        allow = set(args.hosts)
        unique_urls = [u for u in unique_urls if (urlparse(u).hostname or "") in allow]
        print(f"Filtered by hosts {sorted(allow)}: {len(unique_urls)} URLs to probe.")

    print(f"Found {total_links} manifest entries → {len(occurrences)} unique URLs")
    if args.hosts:
        print(f"Probing {len(unique_urls)} after host filter")
    else:
        # probe_url issues a single GET that follows redirects, so describe
        # it that way rather than as "HEAD with GET fallback".
        print(f"Probing {len(unique_urls)} unique URLs (GET following redirects, "
              f"timeout {TIMEOUT_SECONDS}s, {MAX_WORKERS} workers)")

    started = time.time()
    results: dict[str, dict] = {}
    completed = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
        future_to_url = {ex.submit(probe_url, u): u for u in unique_urls}
        for fut in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[fut]
            try:
                results[url] = fut.result()
            except Exception as e:
                # Record the failure as a sentinel status so one bad probe
                # cannot abort the whole run.
                results[url] = {"status": f"exception: {type(e).__name__}", "host": urlparse(url).hostname or ""}
            completed += 1
            if not args.quiet and completed % 25 == 0:
                print(f" ... {completed}/{len(unique_urls)} probed", file=sys.stderr)

    elapsed = time.time() - started

    # ────────── Aggregation ──────────
    by_status: Counter[str] = Counter()
    by_host: dict[str, Counter[str]] = defaultdict(Counter)
    broken = []

    SUCCESS_CODES = {200, 201, 204, 301, 302, 303, 307, 308}
    # by_status / by_host are keyed by str(status); keep a string view of the
    # same set so the per-host summary agrees with the `broken` list.
    success_labels = {str(code) for code in SUCCESS_CODES}

    for url, info in results.items():
        status = info.get("status")
        host = info.get("host", "")
        status_str = str(status)
        by_status[status_str] += 1
        by_host[host][status_str] += 1

        # Broken = anything that isn't a 2xx/3xx success code.
        # Sentinel strings (timeout/dns/tls/known-dead/...) all count as broken.
        is_success = isinstance(status, int) and status in SUCCESS_CODES
        if not is_success:
            broken.append({
                "url": url,
                "status": status,
                "host": host,
                "occurrences": occurrences.get(url, 0),
            })

    broken.sort(key=lambda r: -r["occurrences"])

    report = {
        "checked_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "elapsed_seconds": round(elapsed, 1),
        "total_links": total_links,
        "unique_urls": len(occurrences),
        "probed": len(unique_urls),
        "by_status": dict(by_status),
        "by_host": {h: dict(c) for h, c in sorted(by_host.items())},
        "broken_count": len(broken),
        "broken": broken,
    }

    REPORT_PATH.write_text(json.dumps(report, indent=2))
    print(f"\nReport written to {REPORT_PATH}")

    # ────────── Human summary ──────────
    print(f"\n=== Summary ({elapsed:.1f}s) ===")
    print(f"Manifest entries: {total_links}")
    print(f"Unique URLs: {len(occurrences)}")
    print(f"Probed: {len(unique_urls)}")
    print("\nBy status:")
    for s, n in sorted(by_status.items(), key=lambda kv: -kv[1]):
        print(f" {s:>14} {n}")

    print("\nTop 10 broken hosts (by unique URL count):")
    host_broken = sorted(
        (
            (h, sum(n for s, n in cs.items() if s not in success_labels))
            for h, cs in by_host.items()
        ),
        key=lambda kv: -kv[1],
    )[:10]
    for h, n in host_broken:
        if n:
            print(f" {h:>40} {n} broken")

    print("\nTop 10 broken URLs (by user-impact = occurrence count):")
    for b in broken[:10]:
        print(f" [{b['status']}] x{b['occurrences']:<4} {b['url'][:90]}")

    if args.fail_on_broken and broken:
        print(f"\n❌ {len(broken)} broken URLs — exiting 1", file=sys.stderr)
        return 1
    return 0
|
||||
|
||||
|
||||
# Script entry point: forward the CLI arguments (minus the program name) to
# main() and use its return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
|
||||
@@ -1,29 +0,0 @@
|
||||
{
|
||||
"vol1_benchmarking": "/contents/vol1/benchmarking/benchmarking.html",
|
||||
"vol1_data_engineering": "/contents/vol1/data_engineering/data_engineering.html",
|
||||
"vol1_frameworks": "/contents/vol1/frameworks/frameworks.html",
|
||||
"vol1_hw_acceleration": "/contents/vol1/hw_acceleration/hw_acceleration.html",
|
||||
"vol1_ml_ops": "/contents/vol1/ml_ops/ml_ops.html",
|
||||
"vol1_ml_systems": "/contents/vol1/ml_systems/ml_systems.html",
|
||||
"vol1_ml_workflow": "/contents/vol1/ml_workflow/ml_workflow.html",
|
||||
"vol1_model_serving": "/contents/vol1/model_serving/model_serving.html",
|
||||
"vol1_nn_architectures": "/contents/vol1/nn_architectures/nn_architectures.html",
|
||||
"vol1_nn_computation": "/contents/vol1/nn_computation/nn_computation.html",
|
||||
"vol1_responsible_engr": "/contents/vol1/responsible_engr/responsible_engr.html",
|
||||
"vol1_training": "/contents/vol1/training/training.html",
|
||||
"vol2_collective_communication": "/contents/vol2/collective_communication/collective_communication.html",
|
||||
"vol2_compute_infrastructure": "/contents/vol2/compute_infrastructure/compute_infrastructure.html",
|
||||
"vol2_data_storage": "/contents/vol2/data_storage/data_storage.html",
|
||||
"vol2_distributed_training": "/contents/vol2/distributed_training/distributed_training.html",
|
||||
"vol2_edge_intelligence": "/contents/vol2/edge_intelligence/edge_intelligence.html",
|
||||
"vol2_fault_tolerance": "/contents/vol2/fault_tolerance/fault_tolerance.html",
|
||||
"vol2_fleet_orchestration": "/contents/vol2/fleet_orchestration/fleet_orchestration.html",
|
||||
"vol2_inference": "/contents/vol2/inference/inference.html",
|
||||
"vol2_network_fabrics": "/contents/vol2/network_fabrics/network_fabrics.html",
|
||||
"vol2_ops_scale": "/contents/vol2/ops_scale/ops_scale.html",
|
||||
"vol2_performance_engineering": "/contents/vol2/performance_engineering/performance_engineering.html",
|
||||
"vol2_responsible_ai": "/contents/vol2/responsible_ai/responsible_ai.html",
|
||||
"vol2_robust_ai": "/contents/vol2/robust_ai/robust_ai.html",
|
||||
"vol2_security_privacy": "/contents/vol2/security_privacy/security_privacy.html",
|
||||
"vol2_sustainable_ai": "/contents/vol2/sustainable_ai/sustainable_ai.html"
|
||||
}
|
||||
Reference in New Issue
Block a user