cs249r_book/shared/scripts/check-internal-links.py
Vijay Janapa Reddi 69054ab9bc chore(links): allowlist CI-injected mlsysim-paper.pdf
The link checker fired on three references to mlsysim-paper.pdf in
mlsysim/docs/{whitepaper,for-instructors,index}.qmd. That PDF is
intentionally not committed — the mlsysim-publish-live workflow copies
pdf-artifacts/paper.pdf into mlsysim/docs/mlsysim-paper.pdf at deploy.

Add CI_INJECTED_BASENAMES with mlsysim-paper.pdf so the offline checker
skips it. Match by Path(target).name to handle ./ and ../ variants.
2026-05-06 08:10:02 -04:00

#!/usr/bin/env python3
"""Tier 1 link checker: validate internal Markdown / Quarto links offline.
Scope on purpose:
- Validate ONLY relative-path links and same-file anchor links inside
`.md` / `.qmd` files.
- DO NOT touch external URLs (http/https/mailto/tel/...). External
reachability is Lychee's job in CI; doing it here would make
pre-commit slow and network-flaky.
Why a separate tool from `book/binder`:
- The book toolchain owns Quarto cross-references (`@fig-foo`,
`@sec-bar`), bibliography keys, label hygiene, etc.
- This tool owns plain Markdown link integrity and works repo-wide
(every Quarto site, plus loose READMEs).
Usage:
python3 shared/scripts/check-internal-links.py # check the whole repo
python3 shared/scripts/check-internal-links.py FILE... # check named files
python3 shared/scripts/check-internal-links.py --staged # check git-staged files
python3 shared/scripts/check-internal-links.py --quiet # only print failures
Exit codes:
0 every internal link resolves
1 one or more broken internal links (printed file:line: detail)
2 invocation error
"""
from __future__ import annotations
import argparse
import os
import re
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parents[2]
# Directories we never validate links in.
EXCLUDE_DIRS = {
".git",
".venv",
"node_modules",
"_site",
"_book",
"_build",
".quarto",
"htmlcov",
"site-packages",
# Per-stage build outputs and vendored extensions:
"_freeze",
"_extensions",
# The dev preview mirror gets very large and is a copy of generated HTML.
"_archive",
}
# File globs we walk when no explicit list is given.
DEFAULT_GLOBS = ("**/*.md", "**/*.qmd")
# Match Markdown inline links: [text](target) and image links: ![alt](target).
# - target may be wrapped in <...> for spaces (rare, but handle it).
# - Not greedy on the [...] part; nested brackets in link text are unusual
# in our content and would need a pure-PEG parser to handle safely.
LINK_RE = re.compile(
r"(?P<bang>!?)\[(?P<text>[^\]]*)\]\((?P<target><[^>]+>|[^)\s]+)(?:\s+\"[^\"]*\")?\)"
)
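# Illustration: in '[Setup guide](docs/setup.qmd#install "Setup")' the regex
# captures text="Setup guide" and target="docs/setup.qmd#install"; the
# optional quoted title is consumed but not captured.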
# Match a fenced-code-block opener/closer: ``` or ~~~ optionally followed by
# an info string. Quarto/Pandoc allow attribute braces (```{python}) too.
FENCE_OPEN_RE = re.compile(r"^(?P<indent>\s{0,3})(?P<fence>`{3,}|~{3,})(?P<info>.*)$")
# Strip inline code spans `...` so a backticked target like `[x](y)` inside
# a paragraph isn't treated as a link.
INLINE_CODE_RE = re.compile(r"`[^`\n]*`")
# Strip HTML comments (single- or multi-line). Authors stash TikZ source,
# review notes, and other non-rendered fragments in `<!-- ... -->` blocks;
# the LINK_RE regex would otherwise match TikZ syntax like
# `\node[mycycle](B1){GPU 1};` as a Markdown link `[mycycle](B1)`.
# Replace each non-newline char with a space to preserve line numbers and
# column offsets, so any *real* failures we report stay accurate.
HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
# Quarto / Pandoc cross-reference targets. These resolve at render time
# against the project-wide cross-ref index, not against the filesystem,
# so the link checker cannot validate them offline. The book toolchain
# (`book-check-references`) owns this validation.
# Examples we must skip: `[See sec](@sec-foo)`, `[Fig](@fig-bar)`,
# `[Tbl](@tbl-baz)`, `[Eq](@eq-qux)`, `[Lst](@lst-x)`.
QUARTO_XREF_PREFIXES = ("@sec-", "@fig-", "@tbl-", "@eq-", "@lst-", "@thm-",
"@cor-", "@def-", "@exr-", "@exm-", "@prp-")
# Artifacts injected by CI at deploy time, not present in the committed tree.
# Match by basename so source links resolve regardless of relative path.
# Matched against `Path(path_part).name`, so subdirectory variants
# (`./mlsysim-paper.pdf`, `../mlsysim-paper.pdf`) are also skipped.
# Source: mlsysim-publish-live.yml copies pdf-artifacts/paper.pdf into
# mlsysim/docs/mlsysim-paper.pdf at deploy.
CI_INJECTED_BASENAMES = {
"mlsysim-paper.pdf",
}
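# e.g. "[paper](mlsysim-paper.pdf)" and "[paper](../mlsysim-paper.pdf)" both
# reduce to the basename "mlsysim-paper.pdf" and are skipped.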
# Match explicit Quarto / Pandoc anchor syntax inside headings:
# ## My Header {#sec-foo}
ANCHOR_ATTR_RE = re.compile(r"\{#(?P<id>[^\s}]+)[^}]*\}")
# Match ATX headings to derive their slugified id.
HEADING_RE = re.compile(r"^(#{1,6})\s+(?P<heading>.+?)\s*$", re.MULTILINE)
# Strip attribute blocks ({...}) and inline code spans from headings before slugifying.
HEADING_CLEAN_RE = re.compile(r"\{[^}]*\}|`[^`]*`")
# Schemes that are external and out of scope for this checker.
EXTERNAL_SCHEMES = (
"http://",
"https://",
"mailto:",
"tel:",
"ftp://",
"ftps://",
"ssh://",
"git://",
"data:",
"javascript:",
)
@dataclass(frozen=True)
class Problem:
file: Path
line: int
target: str
detail: str
def render(self, root: Path) -> str:
try:
rel = self.file.relative_to(root)
except ValueError:
rel = self.file
return f"{rel}:{self.line}: broken link → {self.target!r} ({self.detail})"
def slugify(heading: str) -> str:
"""Approximate Pandoc's `--id-prefix=section` slug for an ATX heading.
Matches Pandoc's `auto_identifiers` extension well enough for our content:
    - Strip leading non-letter characters (digits included, as Pandoc does).
- Replace whitespace with single hyphens.
- Lowercase.
- Drop chars that aren't alnum, hyphen, underscore, period, or colon.
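
    Examples (illustrative; the values follow from the rules above):

        >>> slugify("Getting Started")
        'getting-started'
        >>> slugify("2024 Roadmap")
        'roadmap'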
"""
cleaned = HEADING_CLEAN_RE.sub("", heading).strip()
# Pandoc strips leading non-letter characters from the slug.
cleaned = re.sub(r"^[^A-Za-z]+", "", cleaned)
cleaned = cleaned.lower()
cleaned = re.sub(r"\s+", "-", cleaned)
cleaned = re.sub(r"[^a-z0-9_\-.:]", "", cleaned)
return cleaned
def github_slugify(heading: str) -> str:
"""Approximate GitHub's heading-anchor slug (used in `.md` READMEs).
GitHub's algorithm differs from Pandoc in two important ways:
- It does NOT strip leading non-letter chars, so emoji-prefixed
headings like `## 🎯 20 Progressive Modules` produce anchors
that start with a hyphen (`-20-progressive-modules`) because
the emoji collapses to nothing while the trailing space
becomes a leading hyphen.
- It removes punctuation but preserves underscores and hyphens.
Generating both slugs and registering both as valid anchors keeps the
checker honest for repos that mix Quarto-rendered (.qmd) and
GitHub-rendered (.md) content. When the two algorithms agree the
extra entry is harmless.
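
    Examples (illustrative; the values follow from the rules above):

        >>> github_slugify("🎯 20 Progressive Modules")
        '-20-progressive-modules'
        >>> github_slugify("Getting Started")
        'getting-started'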
"""
cleaned = HEADING_CLEAN_RE.sub("", heading)
cleaned = cleaned.lower()
# Drop everything that isn't a word char, whitespace, or hyphen.
cleaned = re.sub(r"[^\w\s\-]", "", cleaned)
cleaned = re.sub(r"\s+", "-", cleaned)
return cleaned
def collect_anchors(text: str) -> set[str]:
"""Return the set of anchor ids defined in the given Markdown source.
Combines:
- Explicit `{#id}` attributes anywhere in the document.
- Auto-generated heading slugs.
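
    Examples (illustrative):

        >>> sorted(collect_anchors("# Getting Started"))
        ['getting-started']
        >>> sorted(collect_anchors("## Setup {#sec-setup}"))
        ['sec-setup']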
"""
anchors: set[str] = set()
for m in ANCHOR_ATTR_RE.finditer(text):
anchors.add(m.group("id"))
for m in HEADING_RE.finditer(text):
heading = m.group("heading").strip()
# If a heading already declares {#id}, the explicit id wins; capture it
# AND skip the slugified form because the auto slug isn't generated.
explicit = ANCHOR_ATTR_RE.search(heading)
if explicit:
continue
slug = slugify(heading)
if slug:
anchors.add(slug)
gh_slug = github_slugify(heading)
if gh_slug:
anchors.add(gh_slug)
return anchors
def is_external(target: str) -> bool:
# Match schemes case-insensitively. Authors occasionally type `HTTP://` or
# `HTTPS://` (especially after auto-capitalize on mobile or in copied
# academic citations); these are still external URLs and out of scope for
# this offline checker. A strict prefix match would otherwise treat them
# as relative paths and report bogus failures.
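    # e.g. "HTTPS://Example.org/Paper.pdf" and "mailto:team@example.org" count
    # as external; "figures/arch.png" and "../index.qmd" do not.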
    # Slice enough of the target to cover the longest scheme prefix:
    # "javascript:" is 11 chars, so a fixed 8-char slice would miss it.
    # lstrip("<") tolerates autolink-style targets such as <https://...>.
    head = target.lstrip("<")[: max(len(s) for s in EXTERNAL_SCHEMES)].lower()
return any(head.startswith(s) for s in EXTERNAL_SCHEMES)
def is_quarto_xref(target: str) -> bool:
"""True for Quarto cross-ref targets like `@sec-foo`, `@fig-bar`.
These resolve through the project's cross-ref index at render time, not
through the filesystem. Validation belongs to the book toolchain's
`book-check-references` hook, not here.
"""
return target.startswith(QUARTO_XREF_PREFIXES)
def strip_html_comments(text: str) -> str:
"""Erase HTML comment bodies while preserving line/column offsets.
Each non-newline character inside a `<!-- ... -->` block becomes a
space; newlines stay intact. This keeps any subsequent line-number /
column reporting accurate, while preventing the link regex from
matching link-shaped tokens that are author-visible only (TikZ
source, review notes, etc.).
"""
def _blank(match: re.Match) -> str:
return re.sub(r"[^\n]", " ", match.group(0))
return HTML_COMMENT_RE.sub(_blank, text)
def split_target(target: str) -> tuple[str, str]:
"""Split a link target into (path, anchor)."""
if target.startswith("<") and target.endswith(">"):
target = target[1:-1]
if "#" not in target:
return target, ""
path, _, anchor = target.partition("#")
return path, anchor
def candidate_paths(source: Path, raw: str) -> list[Path]:
"""Return possible filesystem paths a link target could resolve to.
Quarto sites resolve relative links against the file's directory, but
`.qmd` files often link to a sibling without the extension (the rendered
output is `.html`). We also accept `index.qmd` shorthand for directory
targets.
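
    For example, a bare link target `setup` written in `docs/index.qmd`
    yields the candidates docs/setup, docs/setup.qmd, docs/setup.md,
    docs/setup.html, docs/setup/index.qmd and docs/setup/index.md, which
    the caller tries in that order.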
"""
if not raw:
return []
base = source.parent
if raw.startswith("/"):
# Site-absolute paths are resolved at render time and depend on the
# site's configured `site-url` / base path. We can't validate them
# offline reliably, so skip with a soft pass.
return []
raw_path = Path(raw)
candidates = [base / raw_path]
# Sometimes authors write `foo` when they mean `foo.qmd` (rendered as
# `foo.html`). Only meaningful when raw has no extension.
if not raw_path.suffix:
candidates.append((base / raw_path).with_suffix(".qmd"))
candidates.append((base / raw_path).with_suffix(".md"))
candidates.append((base / raw_path).with_suffix(".html"))
candidates.append(base / raw_path / "index.qmd")
candidates.append(base / raw_path / "index.md")
elif raw_path.suffix == ".html":
# `foo.html` in source most likely points at sibling `foo.qmd`.
candidates.append((base / raw_path).with_suffix(".qmd"))
candidates.append((base / raw_path).with_suffix(".md"))
return candidates
def file_text_cache() -> "dict[Path, str]":
return {}
def read(path: Path, cache: dict[Path, str]) -> str | None:
if path in cache:
return cache[path]
try:
text = path.read_text(encoding="utf-8")
except (OSError, UnicodeDecodeError):
return None
cache[path] = text
return text
def check_file(path: Path, cache: dict[Path, str]) -> list[Problem]:
if not path.exists():
return [Problem(path, 0, "", "file does not exist")]
text = read(path, cache)
if text is None:
return [Problem(path, 0, "", "file unreadable as UTF-8")]
own_anchors = collect_anchors(text)
problems: list[Problem] = []
# Erase HTML comment bodies BEFORE scanning. We compute anchors against
# the unmodified text (you can legitimately put a {#id} inside a comment
# that documents something), but we never want to *follow* link-shaped
# tokens out of a comment block.
scan_text = strip_html_comments(text)
in_fence = False
    fence_marker: str | None = None  # Normalized opener ("```" or "~~~") of the current fence.
for line_no, line in enumerate(scan_text.splitlines(), start=1):
# Track fenced code blocks (``` or ~~~). Inside a fence, skip all link
# parsing — TikZ, raw LaTeX, and code samples otherwise produce piles of
# false positives that look like Markdown links.
fence_match = FENCE_OPEN_RE.match(line)
if fence_match:
marker = fence_match.group("fence")[0] * 3 # normalize length to 3
if not in_fence:
in_fence = True
fence_marker = marker
elif fence_marker is not None and line.lstrip().startswith(fence_marker):
in_fence = False
fence_marker = None
continue
if in_fence:
continue
# Strip inline code so backticked link-shaped strings don't trip us.
scan_line = INLINE_CODE_RE.sub("", line)
for m in LINK_RE.finditer(scan_line):
target = m.group("target")
if not target or is_external(target) or is_quarto_xref(target):
continue
path_part, anchor = split_target(target)
if not path_part:
# Pure same-file anchor.
if anchor and anchor not in own_anchors:
problems.append(
Problem(path, line_no, target, "anchor not found in this file")
)
continue
if Path(path_part).name in CI_INJECTED_BASENAMES:
continue # Built and dropped in by CI; not in the committed tree.
cands = candidate_paths(path, path_part)
if not cands:
continue # Site-absolute or unparsable; skip.
resolved = next((c for c in cands if c.exists()), None)
if resolved is None:
problems.append(
Problem(path, line_no, target, f"no such file (checked {len(cands)} candidate paths)")
)
continue
if anchor:
target_text = read(resolved, cache)
if target_text is None:
# File exists but isn't text we can scan (e.g. a binary).
# Treat anchor as opaque; don't flag.
continue
target_anchors = collect_anchors(target_text)
if anchor not in target_anchors:
problems.append(
Problem(path, line_no, target, f"anchor #{anchor} not found in {resolved.name}")
)
return problems
def staged_files(root: Path) -> list[Path]:
cmd = ["git", "diff", "--cached", "--name-only", "--diff-filter=ACMR"]
try:
out = subprocess.check_output(cmd, cwd=root, text=True)
except (subprocess.CalledProcessError, FileNotFoundError) as exc:
sys.stderr.write(f"check-internal-links: failed to list staged files: {exc}\n")
sys.exit(2)
files = []
for line in out.splitlines():
line = line.strip()
if not line:
continue
if not (line.endswith(".md") or line.endswith(".qmd")):
continue
files.append((root / line).resolve())
return files
def discover_files(root: Path) -> list[Path]:
files: list[Path] = []
for glob in DEFAULT_GLOBS:
for candidate in root.glob(glob):
if any(part in EXCLUDE_DIRS for part in candidate.parts):
continue
files.append(candidate)
return files
def main(argv: list[str]) -> int:
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("files", nargs="*", help="Specific .md/.qmd files to check.")
parser.add_argument("--staged", action="store_true", help="Check files staged for commit.")
parser.add_argument("--quiet", "-q", action="store_true", help="Only print failures.")
parser.add_argument(
"--exclude",
action="append",
default=[],
metavar="GLOB",
help="Exclude files whose repo-relative path matches this glob. Repeatable.",
)
args = parser.parse_args(argv)
if args.staged and args.files:
parser.error("--staged is mutually exclusive with explicit FILES.")
root = REPO_ROOT
if args.staged:
files = staged_files(root)
elif args.files:
files = []
for raw in args.files:
p = Path(raw)
if not p.is_absolute():
p = (root / p).resolve()
if p.suffix not in (".md", ".qmd"):
continue
if any(part in EXCLUDE_DIRS for part in p.parts):
continue
files.append(p)
else:
files = discover_files(root)
if args.exclude:
import fnmatch
kept = []
for f in files:
try:
rel = str(f.relative_to(root))
except ValueError:
rel = str(f)
if any(fnmatch.fnmatch(rel, pat) for pat in args.exclude):
continue
kept.append(f)
files = kept
if not files:
if not args.quiet:
print("check-internal-links: no .md/.qmd files to check.")
return 0
cache: dict[Path, str] = {}
all_problems: list[Problem] = []
for path in sorted(set(files)):
all_problems.extend(check_file(path, cache))
if all_problems:
for prob in all_problems:
print(prob.render(root))
print(f"\ncheck-internal-links: {len(all_problems)} broken internal link(s) in {len({p.file for p in all_problems})} file(s).", file=sys.stderr)
return 1
if not args.quiet:
print(f"check-internal-links: OK ({len(files)} file(s) scanned).")
return 0
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))