# cs249r_book/shared/scripts/build-redirects.py

#!/usr/bin/env python3
"""Generate redirect HTML stubs (and a Netlify _redirects file) from the
shared redirect-map.

Why this script exists
----------------------
GitHub Pages doesn't honor server-side redirects. To preserve SEO juice
from the legacy mlsysbook.ai URLs after the staged rollout, we emit one
tiny HTML file per legacy path:

  <meta http-equiv="refresh" content="0;url=<to>">
  <link rel="canonical" href="<to>">
  <meta name="robots" content="noindex,follow">

Crawlers treat the canonical as authoritative, drop the legacy URL on
recrawl (the noindex), and follow the link graph through to the new
location. Real users hit the meta-refresh and arrive in <100ms.

The same map ALSO produces a Netlify-format `_redirects` file so that if
we ever move off GitHub Pages to a host that supports real 301s, the
existing redirect map drives that day-one without a second source of
truth.

Usage
-----
  build-redirects.py --map shared/config/redirect-map.json \\
                     --out gh-pages-staging/ \\
                     [--base-url https://mlsysbook.ai] \\
                     [--check]

  --check  Validates the JSON without writing any files (CI-friendly).
"""

from __future__ import annotations

import argparse
import html
import json
import sys
from pathlib import Path
from typing import Any

# Site origin used as the default for --base-url, i.e. what relative 'to'
# values are resolved against when no base URL is given on the command line.
DEFAULT_BASE_URL = "https://mlsysbook.ai"

# HTML skeleton for a single redirect stub. write_html_stub() fills the
# {dest} and {canonical} placeholders via str.format(). The page carries a
# zero-delay meta refresh, a canonical link, a "noindex,follow" robots
# directive, and a visible fallback link in the body.
STUB_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="refresh" content="0;url={dest}">
<link rel="canonical" href="{canonical}">
<meta name="robots" content="noindex,follow">
<title>Redirecting…</title>
</head>
<body>
<p>This page has moved to <a href="{dest}">{dest}</a>.</p>
</body>
</html>
"""


def resolve_dest(to: str, base_url: str) -> str:
    """Turn a redirect target into an absolute URL.

    Targets that already carry an ``http://`` or ``https://`` scheme are
    returned untouched. Anything else is treated as a site path: a leading
    ``/`` is added when missing, and the path is appended to `base_url`
    after stripping the base's trailing slashes.
    """
    already_absolute = to.startswith("http://") or to.startswith("https://")
    if already_absolute:
        return to
    path = to if to.startswith("/") else f"/{to}"
    origin = base_url.rstrip("/")
    return f"{origin}{path}"


def validate_entry(i: int, entry: dict[str, Any]) -> list[str]:
    """Return a list of validation errors for one entry. Empty list = OK.

    Checks performed:
      * the entry itself is a JSON object (dict);
      * 'from' is present and is a non-empty string starting with '/';
      * 'to' is present and is a string;
      * 'status', when given, is one of the integers 301/302/307/308;
      * a '*' wildcard appears at most once, as the final path segment.

    `i` is the entry's position in the map and is used only to label the
    messages. Malformed input is always reported as an error string; this
    function never raises on bad data.
    """
    where = f"redirects[{i}]"
    # The map is hand-edited JSON, so an entry may be a string/list/number.
    # Report that instead of crashing on `.get` below.
    if not isinstance(entry, dict):
        return [f"{where}: entry must be an object (got {type(entry).__name__})"]

    errs: list[str] = []
    for required in ("from", "to"):
        if required not in entry:
            errs.append(f"{where}: missing required field '{required}'")
        elif not isinstance(entry[required], str):
            errs.append(
                f"{where}: '{required}' must be a string (got {entry[required]!r})"
            )

    src = entry.get("from")
    # Only run string checks on an actual string; the type error (if any)
    # has already been recorded above.
    src_is_str = isinstance(src, str)
    if src_is_str:
        if not src:
            # An empty 'from' would materialize as <out>/index.html and
            # silently replace the site's home page.
            errs.append(f"{where}: 'from' must not be empty")
        elif not src.startswith("/"):
            errs.append(f"{where}: 'from' must start with '/' (got {src!r})")

    status = entry.get("status", 301)
    # Require a real int so values like 301.0 are not written out verbatim.
    if not isinstance(status, int) or status not in (301, 302, 307, 308):
        errs.append(f"{where}: 'status' should be 301/302/307/308 (got {status!r})")

    # Wildcard handling: only allowed as a final '*' segment for now, so
    # reject both a misplaced '*' and any additional '*' before the last one.
    if src_is_str and "*" in src and (not src.endswith("/*") or "*" in src[:-1]):
        errs.append(
            f"{where}: wildcard '*' currently only supported as the trailing "
            f"path segment (e.g. '/foo/*'); got {src!r}"
        )
    return errs


def write_html_stub(out_root: Path, src: str, dest_url: str) -> Path:
    """Write the redirect stub for `src` under `out_root` and return its path.

    A `src` that already names an ``.html``/``.htm`` file is written to that
    exact file; every other path (empty, trailing-slash, or extensionless)
    is written as ``<src>/index.html`` so the legacy URL resolves as a
    directory index. Parent directories are created as needed and an
    existing file at the target is overwritten.

    `dest_url` is HTML-escaped before being substituted into the template:
    it lands inside double-quoted attributes and in element text, so a raw
    ``&`` (common in query strings), ``"`` or ``<`` would otherwise produce
    malformed or injectable markup.
    """
    rel = src.lstrip("/")
    if rel.endswith((".html", ".htm")):
        target = out_root / rel
    else:
        target = out_root / rel / "index.html"

    # Browsers decode the entities back to the original URL when they read
    # the attribute values, so the redirect destination is unchanged.
    safe_dest = html.escape(dest_url, quote=True)

    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(
        STUB_TEMPLATE.format(dest=safe_dest, canonical=safe_dest),
        encoding="utf-8",
    )
    return target


def write_netlify_file(out_root: Path, lines: list[str]) -> Path:
    """Write `lines` to ``out_root/_redirects`` and return that path.

    The file starts with a two-line generated-file banner, followed by the
    given rule lines joined with newlines and a final trailing newline.
    `out_root` must already exist; the file is written as UTF-8.
    """
    banner = (
        "# Generated by shared/scripts/build-redirects.py — do not edit by hand.\n"
        "# Source of truth: shared/config/redirect-map.json\n"
    )
    rules = "\n".join(lines)
    redirects_path = out_root / "_redirects"
    redirects_path.write_text(f"{banner}{rules}\n", encoding="utf-8")
    return redirects_path


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: validate the redirect map and emit outputs.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``; passing a list allows programmatic use.

    Returns:
        Process exit status: 0 on success (or a clean ``--check``), 1 when
        the map has validation errors, 2 when the map cannot be loaded or
        does not have the expected top-level shape. Invalid command-line
        usage exits through argparse (SystemExit).
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is pre-formatted (headings, indented usage
        # block); the default formatter would reflow it into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--map", required=True, help="Path to redirect-map.json")
    ap.add_argument(
        "--out",
        required=False,
        help="Directory to emit redirect stubs into (typically the staging "
        "copy of gh-pages). Required unless --check is passed.",
    )
    ap.add_argument(
        "--base-url",
        default=DEFAULT_BASE_URL,
        help=f"Base URL for relative 'to' values (default: {DEFAULT_BASE_URL})",
    )
    ap.add_argument(
        "--check",
        action="store_true",
        help="Validate the map only; do not write files.",
    )
    args = ap.parse_args(argv)

    if not args.check and not args.out:
        ap.error("--out is required unless --check is passed")

    map_path = Path(args.map)
    try:
        data = json.loads(map_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable/directory paths; ValueError
        # covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"❌ Could not load {map_path}: {e}", file=sys.stderr)
        return 2

    # Valid JSON can still be a list/string/number at the top level; report
    # that rather than failing on `.get` with a traceback.
    if not isinstance(data, dict):
        print("❌ Top-level JSON value must be an object", file=sys.stderr)
        return 2

    redirects = data.get("redirects", [])
    if not isinstance(redirects, list):
        print("❌ 'redirects' must be a list", file=sys.stderr)
        return 2

    errors: list[str] = []
    for i, entry in enumerate(redirects):
        errors.extend(validate_entry(i, entry))

    if errors:
        print("❌ Validation errors:", file=sys.stderr)
        for e in errors:
            print(f"  - {e}", file=sys.stderr)
        return 1

    print(f"✅ Validated {len(redirects)} redirect entries from {map_path}")

    if args.check:
        return 0

    out_root = Path(args.out)
    out_root.mkdir(parents=True, exist_ok=True)

    netlify_lines: list[str] = []
    written = 0
    for entry in redirects:
        src = entry["from"]
        dest = resolve_dest(entry["to"], args.base_url)
        status = entry.get("status", 301)

        # Every entry, wildcard or not, gets a line in the _redirects file.
        netlify_lines.append(f"{src}  {dest}  {status}")

        if "*" in src:
            # We cannot statically expand wildcards — that requires walking
            # the deployed tree. Skip stub emission and rely on the Netlify
            # _redirects line for hosts that support it.
            continue

        target = write_html_stub(out_root, src, dest)
        written += 1
        print(f"  → {src}  ⇒  {dest}  ({target.relative_to(out_root)})")

    write_netlify_file(out_root, netlify_lines)
    print(f"✅ Wrote {written} HTML stub(s) and 1 _redirects file to {out_root}")
    return 0


if __name__ == "__main__":
    # Run the CLI and hand main()'s return value to the shell as the
    # process exit status.
    sys.exit(main())