Files
cs249r_book/book/cli/commands/validate.py
Vijay Janapa Reddi 72d64a5499 cell updates
2026-02-28 13:03:38 -05:00

3120 lines
135 KiB
Python

"""
Native validation commands for MLSysBook Binder CLI.
Validation logic is implemented in Binder where possible (e.g. references,
citations, labels, figures, rendering). Some checks still delegate to scripts
under book/tools/scripts/ (tables, spelling, epub, sources, grid-tables,
images). See book/cli/BINDER_NATIVE_AUDIT.md for the full list.
"""
from __future__ import annotations
import argparse
import json
import os
import re
import time
from collections import defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from . import reference_check
# Module-wide Rich console; the command prints its human-readable output through it.
console = Console()
@dataclass
class ValidationIssue:
    """One finding produced by a validation check."""

    # File the finding belongs to, as reported by the check that created it.
    file: str
    # Line number of the finding within ``file``.
    line: int
    # Short machine-readable identifier of the rule that fired.
    code: str
    # Human-readable explanation of the finding.
    message: str
    # "error" findings fail a run; the default is "error".
    severity: str = "error"
    # Optional excerpt of the offending text.
    context: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return the issue as a plain dict with one key per field."""
        field_names = ("file", "line", "code", "message", "severity", "context")
        return {name: getattr(self, name) for name in field_names}
@dataclass
class ValidationRunResult:
    """Outcome of a single validation check."""

    # Name of the check that produced this result.
    name: str
    # One-line description of what the check verifies.
    description: str
    # Number of files the check looked at.
    files_checked: int
    # Every finding the check produced (errors and warnings alike).
    issues: List[ValidationIssue]
    # Wall-clock duration of the check in milliseconds.
    elapsed_ms: int

    @property
    def passed(self) -> bool:
        """True unless at least one issue has severity ``"error"``."""
        for issue in self.issues:
            if issue.severity == "error":
                return False
        return True

    def to_dict(self) -> Dict[str, Any]:
        """Return the result, including all serialised issues, as a plain dict."""
        payload: Dict[str, Any] = {
            "name": self.name,
            "description": self.description,
            "files_checked": self.files_checked,
            "passed": self.passed,
            "issue_count": len(self.issues),
            "elapsed_ms": self.elapsed_ms,
        }
        payload["issues"] = [entry.to_dict() for entry in self.issues]
        return payload
# Inline reference `{python} name` or `{python} owner.attr` wrapped in backticks;
# group 1 is the (optionally dotted) name.
INLINE_REF_PATTERN = re.compile(r"`\{python\}\s+(\w+(?:\.\w+)?)`")
# Opening fence of a Python cell: ```{python} or ```python at the start of the text.
CELL_START_PATTERN = re.compile(r"^```\{python\}|^```python")
# Bare closing fence: three backticks followed only by optional whitespace.
CELL_END_PATTERN = re.compile(r"^```\s*$")
# Simple assignment "name = ..."; group 1 is the target name.
# NOTE(review): this also matches the start of "name == ..." — confirm that is acceptable.
ASSIGN_PATTERN = re.compile(r"^([A-Za-z_]\w*)\s*=")
# Tuple unpacking: "a, b = ..." — captures all names on the left side
TUPLE_ASSIGN_PATTERN = re.compile(r"^((?:[A-Za-z_]\w*\s*,\s*)+[A-Za-z_]\w*)\s*=")
# Class header "class Name:" or "class Name(" ; group 1 is the class name.
CLASS_DEF_PATTERN = re.compile(r"^class\s+(\w+)\s*[:(]")
# Grid-table separator row: starts and ends with "+", with only - : = + in between.
GRID_TABLE_SEP_PATTERN = re.compile(r"^\+[-:=+]+\+$")
# Inline Python whose expression does not start with a `*_str` name and that is either
# preceded by an unescaped "$" or followed by a "$" (i.e. sits next to a math delimiter).
LATEX_INLINE_PATTERN = re.compile(r"(?<!\\)\$\s*`\{python\}\s+(?!\w+(?:\.\w+)?_str)[^`]+`|`\{python\}\s+(?!\w+(?:\.\w+)?_str)[^`]+`\s*(?<!\\)\$")
# Same kind of non-`_str` inline Python directly followed by one of the single-operator
# math spans $\times$, $\approx$, $\ll$, $\gg$ or $\mu$; group 1 is the operator name.
LATEX_ADJACENT_PATTERN = re.compile(r"`\{python\}\s+(?!\w+(?:\.\w+)?_str)[^`]+`\s*\$\\(times|approx|ll|gg|mu)\$")
# Any "@key" token; group 1 is the key (letters, digits, "_", ":", "-", ".").
CITATION_REF_PATTERN = re.compile(r"@([A-Za-z0-9_:\-.]+)")
# Bracketed citation list such as [@a] or [-@a; @b].
CITATION_BRACKET_PATTERN = re.compile(r"\[-?@[A-Za-z0-9_:\-.]+(?:;\s*-?@[A-Za-z0-9_:\-.]+)*\]")
# Label type -> patterns that recognise a definition of that type.
# In every pattern, group 1 is the label itself.
LABEL_DEF_PATTERNS = {
    "Figure": [
        re.compile(r"\{#(fig-[\w-]+)"), # {#fig-xyz ...}
        re.compile(r"#\|\s*label:\s*(fig-[\w-]+)"), # #| label: fig-xyz
        re.compile(r"%%\|\s*label:\s*(fig-[\w-]+)"), # %%| label: fig-xyz (Jupyter)
    ],
    "Table": [
        re.compile(r"\{#(tbl-[\w-]+)"), # {#tbl-xyz}
        re.compile(r"#\|\s*label:\s*(tbl-[\w-]+)"), # #| label: tbl-xyz
    ],
    "Section": [
        re.compile(r"\{#(sec-[\w-]+)"), # {#sec-xyz}
        re.compile(r"^id:\s*(sec-[\w-]+)"), # YAML id: sec-xyz
    ],
    "Equation": [re.compile(r"\{#(eq-[\w-]+)")], # {#eq-xyz}
    "Listing": [
        re.compile(r"\{#(lst-[\w-]+)"), # {#lst-xyz ...}
        re.compile(r"#\|\s*label:\s*(lst-[\w-]+)"), # #| label: lst-xyz
    ],
}
# Cross-reference to a label: @fig-..., @tbl-..., @sec-..., @eq-... or @lst-...; group 1 is the label.
LABEL_REF_PATTERN = re.compile(r"@((?:fig|tbl|sec|eq|lst)-[\w-]+)")
# "@" tokens starting with one of these prefixes are not treated as bibliography citations.
EXCLUDED_CITATION_PREFIXES = ("fig-", "tbl-", "sec-", "eq-", "lst-", "ch-", "nb-")
class ValidateCommand:
    """Native `binder check` command group (also available as `binder validate`).

    Groups (see ``GROUPS`` for the scopes inside each one):
        refs       — Python cell syntax, inline Python, cross-refs, citations, self-ref
        labels     — duplicate labels, orphaned/unreferenced labels, fig-label underscores
        headers    — section header IDs
        footnotes  — placement rules, reference integrity, cross-chapter duplicates
        figures    — captions/alt-text, float flow, image files
        rendering  — render patterns, indexes, dropcaps, parts, headings, tables, spacing
        images     — image file formats, external URLs
        json       — JSON file syntax
        units      — physics engine unit tests
        spelling   — prose and TikZ spell checking
        epub       — EPUB structure
        sources    — source citation analysis
        references — bibliography check (hallucinator)
        content    — content tree
        all        — run every check
    """

    # Maps group name → list of (scope_name, runner_method_name) pairs.
    # This is the single source of truth for the hierarchy: the CLI choices,
    # the help table and the dispatch in _run_group are all driven by it.
    GROUPS: Dict[str, List[Tuple[str, str]]] = {
        "refs": [
            ("python-syntax", "_run_python_syntax"),
            ("inline-python", "_run_inline_python"),
            ("cross-refs", "_run_refs"),
            ("citations", "_run_citations"),
            ("inline", "_run_inline_refs"),
            ("self-ref", "_run_self_referential"),
        ],
        "labels": [
            ("duplicates", "_run_duplicate_labels"),
            ("orphans", "_run_unreferenced_labels"),
            ("fig-labels", "_run_fig_label_underscores"),
        ],
        "headers": [
            ("ids", "_run_headers"),
        ],
        "footnotes": [
            ("placement", "_run_footnote_placement"),
            ("integrity", "_run_footnote_refs"),
            ("cross-chapter", "_run_footnote_cross_chapter"),
        ],
        "figures": [
            ("captions", "_run_figures"),
            ("flow", "_run_float_flow"),
            ("files", "_run_images"),
        ],
        "rendering": [
            ("patterns", "_run_rendering"),
            ("python-echo", "_run_python_echo"),
            ("indexes", "_run_indexes"),
            ("dropcaps", "_run_dropcaps"),
            ("parts", "_run_parts"),
            ("heading-levels", "_run_heading_levels"),
            ("duplicate-words", "_run_duplicate_words"),
            ("grid-tables", "_run_grid_tables"),
            ("tables", "_run_table_content"),
            ("ascii", "_run_ascii"),
            ("percent-spacing", "_run_percent_spacing"),
            ("unit-spacing", "_run_unit_spacing"),
            ("binary-units", "_run_binary_units"),
            ("contractions", "_run_contractions"),
            ("unblended-prose", "_run_unblended_prose"),
            ("times-spacing", "_run_times_spacing"),
        ],
        "images": [
            ("formats", "_run_image_formats"),
            ("external", "_run_external_images"),
        ],
        "json": [
            ("syntax", "_run_json_syntax"),
        ],
        "units": [
            ("physics", "_run_unit_tests"),
        ],
        "spelling": [
            ("prose", "_run_spelling_prose"),
            ("tikz", "_run_spelling_tikz"),
        ],
        "epub": [
            ("structure", "_run_epub"),
        ],
        "sources": [
            ("citations", "_run_sources"),
        ],
        "references": [
            ("hallucinator", "_run_check_references"),
        ],
        "content": [
            ("tree", "_run_content_tree"),
        ],
    }
def __init__(self, config_manager, chapter_discovery):
self.config_manager = config_manager
self.chapter_discovery = chapter_discovery
def run(self, args: List[str]) -> bool:
    """Parse ``binder check`` arguments, run the selected checks and report.

    Args:
        args: Command-line arguments that follow ``binder check``.

    Returns:
        True when every executed check passed, or when help was explicitly
        requested. False on argument errors, a missing subcommand, a
        non-existent target path, an unknown ``--scope``, or any failed check.
    """
    all_group_names = list(self.GROUPS.keys()) + ["all"]
    parser = argparse.ArgumentParser(
        prog="binder check",
        description="Run quality checks on book content",
        add_help=True,
    )
    parser.add_argument(
        "subcommand",
        nargs="?",
        choices=all_group_names,
        help="Check group to run (refs, labels, headers, footnotes, figures, rendering, references, content, all)",
    )
    # Options shared by every group.
    parser.add_argument("--scope", default=None, help="Narrow to a specific check within a group")
    parser.add_argument("--path", default=None, help="File or directory path to check")
    parser.add_argument("--vol1", action="store_true", help="Scope to Volume I")
    parser.add_argument("--vol2", action="store_true", help="Scope to Volume II")
    parser.add_argument("--json", action="store_true", help="Emit machine-readable JSON output")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    # Options consumed by the "refs" group.
    parser.add_argument("--citations-in-code", action="store_true", help="refs: check citations in code fences")
    parser.add_argument("--citations-in-raw", action="store_true", help="refs: check citations in raw blocks")
    parser.add_argument("--check-patterns", action="store_true", default=True, help="refs --scope inline: include pattern hazard checks (default: on)")
    parser.add_argument("--no-check-patterns", action="store_false", dest="check_patterns", help="refs --scope inline: skip pattern hazard checks")
    parser.add_argument("--check-scope", action="store_true", default=False, help="refs --scope inline: detect bare variable refs in class bodies that need ClassName.attr")
    parser.add_argument("--no-check-scope", action="store_false", dest="check_scope", help="refs --scope inline: skip scope analysis")
    # Label-type filters consumed by the "labels" group.
    parser.add_argument("--figures", action="store_true", help="labels: filter to figures")
    parser.add_argument("--tables", action="store_true", help="labels: filter to tables")
    parser.add_argument("--sections", action="store_true", help="labels: filter to sections")
    parser.add_argument("--equations", action="store_true", help="labels: filter to equations")
    parser.add_argument("--listings", action="store_true", help="labels: filter to listings")
    parser.add_argument("--all-types", action="store_true", help="labels: all label types")
    # Options consumed by the "references" group; the namespace is passed through to its runner.
    parser.add_argument("-f", "--file", dest="refs_file", action="append", metavar="BIB", help="references: .bib file(s) to check")
    parser.add_argument("-o", "--output", dest="refs_output", metavar="FILE", help="references: write report to FILE")
    parser.add_argument("--limit", type=int, dest="refs_limit", metavar="N", help="references: check only first N refs (quick test)")
    parser.add_argument("--skip-verified", dest="refs_skip_verified", action="store_true", help="references: skip refs already verified in cache")
    parser.add_argument("--thorough", dest="refs_thorough", action="store_true", help="references: revalidate all refs (ignore cache)")
    parser.add_argument("--refs-cache", dest="refs_cache", metavar="FILE", help="references: cache file (default: .references_verified.json in repo root)")
    parser.add_argument("--only-from-report", dest="refs_only_from_report", metavar="FILE", help="references: validate only keys that had issues in this report file")
    parser.add_argument("--only-keys", dest="refs_only_keys_file", metavar="FILE", help="references: validate only keys listed in FILE (one key per line)")
    try:
        ns = parser.parse_args(args)
    except SystemExit:
        # argparse uses SystemExit(0) for --help and non-zero for parse errors.
        # Report success only when help was explicitly asked for.
        return ("-h" in args) or ("--help" in args)
    if not ns.subcommand:
        # No group given: show the overview and signal "nothing was checked".
        self._print_check_help()
        return False
    root_path = self._resolve_path(ns.path, ns.vol1, ns.vol2)
    if not root_path.exists():
        self._emit(ns.json, {"status": "error", "message": f"Path not found: {root_path}"}, failed=True)
        return False
    runs: List[ValidationRunResult] = []
    if ns.subcommand == "all":
        # --scope is not consulted here: every check of every group runs.
        for group_name in self.GROUPS:
            runs.extend(self._run_group(group_name, None, root_path, ns))
    else:
        group_name = ns.subcommand
        scope = ns.scope
        # Reject a scope that does not belong to the chosen group before running anything.
        if scope and not any(s == scope for s, _ in self.GROUPS.get(group_name, [])):
            valid = [s for s, _ in self.GROUPS[group_name]]
            # NOTE(review): this error goes to the console even when --json is set.
            console.print(f"[red]Unknown scope '{scope}' for group '{group_name}'.[/red]")
            console.print(f"[yellow]Valid scopes: {', '.join(valid)}[/yellow]")
            return False
        runs.extend(self._run_group(group_name, scope, root_path, ns))
    # A run fails only on "error" issues (see ValidationRunResult.passed).
    any_failed = any(not run.passed for run in runs)
    summary = {
        "status": "failed" if any_failed else "passed",
        "command": ns.subcommand,
        "path": str(root_path),
        "runs": [run.to_dict() for run in runs],
        "total_issues": sum(len(run.issues) for run in runs),
    }
    if ns.json:
        print(json.dumps(summary, indent=2))
    else:
        self._print_human_summary(summary, verbose=ns.verbose)
    return not any_failed
# ------------------------------------------------------------------
# Group dispatch
# ------------------------------------------------------------------
def _run_group(
self,
group: str,
scope: Optional[str],
root: Path,
ns: argparse.Namespace,
) -> List[ValidationRunResult]:
"""Run all checks in *group*, or just the one matching *scope*."""
results: List[ValidationRunResult] = []
for scope_name, method_name in self.GROUPS[group]:
if scope and scope != scope_name:
continue
method = getattr(self, method_name)
# Some runners need extra kwargs
if method_name == "_run_refs":
checks_code = ns.citations_in_code or (not ns.citations_in_code and not ns.citations_in_raw)
checks_raw = ns.citations_in_raw or (not ns.citations_in_code and not ns.citations_in_raw)
results.append(method(root, citations_in_code=checks_code, citations_in_raw=checks_raw))
elif method_name == "_run_inline_refs":
results.append(method(root, check_patterns=ns.check_patterns,
check_scope=getattr(ns, 'check_scope', False)))
elif method_name in ("_run_duplicate_labels", "_run_unreferenced_labels"):
results.append(method(root, self._selected_label_types(ns)))
elif method_name == "_run_check_references":
results.append(method(root, ns))
else:
results.append(method(root))
return results
def _print_check_help(self) -> None:
    """Print the group/scope overview table followed by a few usage examples."""
    group_blurbs = {
        "refs": "References, citations, inline Python, self-ref",
        "labels": "Duplicate labels, orphans, fig-label underscores",
        "headers": "Section header IDs ({#sec-...})",
        "footnotes": "Placement, integrity, cross-chapter duplicates",
        "figures": "Captions, float flow, image files",
        "rendering": "Patterns, indexes, dropcaps, headings, typos, tables, ASCII",
        "images": "Image file formats, external URLs",
        "json": "JSON file syntax validation",
        "units": "Physics engine unit conversion tests",
        "spelling": "Prose and TikZ spell checking (requires aspell)",
        "epub": "EPUB file validation",
        "sources": "Source citation analysis and validation",
        "references": "Bibliography vs academic DBs (hallucinator)",
        "content": "Content tree (shared/, frontmatter/ required)",
    }
    overview = Table(show_header=True, header_style="bold cyan", box=None)
    column_specs = (
        ("Group", "cyan", 14),
        ("Scopes", "yellow", 38),
        ("Description", "white", 32),
    )
    for heading, colour, width in column_specs:
        overview.add_column(heading, style=colour, width=width)
    # One row per registered group; groups without a blurb get an empty description.
    for group_name, checks in self.GROUPS.items():
        scope_names = [scope_name for scope_name, _ in checks]
        overview.add_row(group_name, ", ".join(scope_names), group_blurbs.get(group_name, ""))
    overview.add_row("all", "(everything)", "Run all checks")
    console.print(Panel(overview, title="binder check <group> [--scope <name>]", border_style="cyan"))
    example_lines = (
        "[dim]Examples:[/dim]",
        " [cyan]./binder check refs[/cyan] [dim]# all reference checks[/dim]",
        " [cyan]./binder check refs --scope citations[/cyan] [dim]# only citation check[/dim]",
        " [cyan]./binder check figures --vol1[/cyan] [dim]# all figure checks, Vol I[/dim]",
        " [cyan]./binder check all[/cyan] [dim]# everything[/dim]",
    )
    for text in example_lines:
        console.print(text)
    console.print()
# ------------------------------------------------------------------
def _resolve_path(self, path_arg: Optional[str], vol1: bool, vol2: bool) -> Path:
if path_arg:
path = Path(path_arg)
if not path.is_absolute():
path = (Path.cwd() / path).resolve()
return path
base = self.config_manager.book_dir / "contents"
if vol1 and not vol2:
return base / "vol1"
if vol2 and not vol1:
return base / "vol2"
return base
def _selected_label_types(self, ns: argparse.Namespace) -> Dict[str, List[re.Pattern[str]]]:
    """Translate the label filter flags on *ns* into a label-type → patterns mapping.

    ``--all-types`` or no filter at all selects every label type; otherwise
    only the types whose flag is set are returned.
    """
    any_filter = ns.figures or ns.tables or ns.sections or ns.equations or ns.listings
    if ns.all_types or not any_filter:
        return LABEL_DEF_PATTERNS
    flag_to_type = (
        ("figures", "Figure"),
        ("tables", "Table"),
        ("sections", "Section"),
        ("equations", "Equation"),
        ("listings", "Listing"),
    )
    return {
        label_type: LABEL_DEF_PATTERNS[label_type]
        for flag, label_type in flag_to_type
        if getattr(ns, flag)
    }
def _qmd_files(self, root: Path) -> List[Path]:
if root.is_file():
return [root] if root.suffix == ".qmd" else []
return sorted(root.rglob("*.qmd"))
def _read_text(self, path: Path) -> str:
try:
return path.read_text(encoding="utf-8")
except UnicodeDecodeError:
return path.read_text(encoding="utf-8", errors="ignore")
def _relative_file(self, path: Path) -> str:
try:
return str(path.relative_to(self.config_manager.book_dir))
except ValueError:
return str(path)
def _run_python_syntax(self, root: Path) -> ValidationRunResult:
    """Compile every ```{python} code block to catch syntax errors.

    Each block that is closed by a bare ``` fence is passed to ``compile``;
    nothing is executed. Blocks that contain only ``#|`` directives or blank
    lines are skipped.

    Args:
        root: File or directory whose ``.qmd`` files are scanned.

    Returns:
        A result named ``python-syntax`` with one ``python_syntax`` error per
        block that fails to compile.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []
    block_start_re = re.compile(r"^```\{python\}")
    block_end_re = re.compile(r"^```\s*$")
    for file in files:
        # Same path convention as every other runner. Using the path relative
        # to *root* produced "." whenever *root* was the file itself.
        rel = self._relative_file(file)
        in_block = False
        block_lines: List[str] = []
        block_start_line = 0
        for i, line in enumerate(self._read_text(file).split("\n"), start=1):
            if block_start_re.match(line):
                in_block = True
                block_lines = []
                block_start_line = i
                continue
            if not in_block:
                continue
            if not block_end_re.match(line):
                block_lines.append(line)
                continue
            in_block = False
            # Blank out #| directives instead of dropping them so that line
            # numbers reported by the compiler still map onto the file.
            source = "\n".join(
                "" if ln.strip().startswith("#|") else ln
                for ln in block_lines
            )
            if not source.strip():
                continue
            try:
                compile(source, f"{rel}:{block_start_line}", "exec")
            except SyntaxError as exc:
                # The fence sits on block_start_line, so block line N is file
                # line block_start_line + N.
                issues.append(ValidationIssue(
                    file=rel,
                    line=block_start_line + (exc.lineno or 1),
                    code="python_syntax",
                    message=f"Python syntax error: {exc.msg}",
                    severity="error",
                    context=(exc.text or "").strip()[:120],
                ))
            except ValueError as exc:
                # Older interpreters raise ValueError (not SyntaxError) for
                # source containing NUL bytes; report it rather than crash.
                issues.append(ValidationIssue(
                    file=rel,
                    line=block_start_line + 1,
                    code="python_syntax",
                    message=f"Python syntax error: {exc}",
                    severity="error",
                    context="",
                ))
    return ValidationRunResult(
        name="python-syntax",
        description="Validate Python code block syntax (compile check)",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
def _run_inline_python(self, root: Path) -> ValidationRunResult:
    """Scan prose lines for malformed or badly placed inline ``{python}`` spans.

    Lines inside fenced code blocks are ignored. Per line, findings are
    emitted in a fixed order: the regex rules, Python inside ``$...$`` math,
    Python in grid tables, unwrapped ``{python}``, then Python in headings.
    """
    started = time.time()
    qmd_paths = self._qmd_files(root)
    found: List[ValidationIssue] = []
    line_rules = [
        ("missing_backtick", re.compile(r"(?<!`)(\{python\}\s+\w+`)"), "Missing opening backtick before {python}", "error"),
        ("dollar_as_backtick", re.compile(r"\$\{python\}\s+\w+`"), "Dollar sign used instead of backtick before {python}", "error"),
        ("display_math", re.compile(r"\$\$[^$]*`?\{python\}"), "Inline Python inside $$...$$ display math", "error"),
        # NOTE: $\times$ adjacent to inline Python is the PREFERRED convention.
        # Only flag non-_str variables inside $...$ math (decimal stripping risk).
        ("latex_adjacent_raw", re.compile(r"`\{python\}\s+(?!\w+_str)[^`]+`\s*\$\\(times|approx|ll|gg|mu|le|ge|neq|pm|cdot|div)"), "Non-_str inline Python adjacent to LaTeX operator (decimal stripping risk)", "warning"),
    ]
    unwrapped_re = re.compile(r"(?<!`)\{python\}\s+\w+")

    def report(rel_name: str, line_no: int, code: str, message: str, severity: str, context: str) -> None:
        found.append(ValidationIssue(
            file=rel_name,
            line=line_no,
            code=code,
            message=message,
            severity=severity,
            context=context,
        ))

    for qmd in qmd_paths:
        rel_name = self._relative_file(qmd)
        inside_fence = False
        inside_grid = False
        for line_no, text in enumerate(self._read_text(qmd).splitlines(), 1):
            trimmed = text.strip()
            if trimmed.startswith("```"):
                inside_fence = not inside_fence
                continue
            if inside_fence:
                continue
            for code, pattern, message, severity in line_rules:
                for hit in pattern.finditer(text):
                    report(rel_name, line_no, code, message, severity, hit.group(0)[:160])
            if LATEX_INLINE_PATTERN.search(text):
                report(
                    rel_name, line_no, "python_in_math",
                    "Inline Python inside $...$ math can render incorrectly",
                    "error", text.strip()[:160],
                )
            # A separator row opens (or continues) a grid table; any non-empty
            # line that does not start with "|" ends it.
            if GRID_TABLE_SEP_PATTERN.match(trimmed):
                inside_grid = True
            elif inside_grid and trimmed and not trimmed.startswith("|"):
                inside_grid = False
            if inside_grid and "`{python}" in text:
                report(
                    rel_name, line_no, "grid_table_python",
                    "Inline Python in grid table; convert to pipe table",
                    "error", text.strip()[:160],
                )
            # Unwrapped {python} — no backticks at all. Cell option lines (#|) are exempt.
            if "{python}" in text and not trimmed.startswith("#|"):
                for hit in unwrapped_re.finditer(text):
                    # An even number of backticks before the match means it is
                    # not inside a backtick span.
                    if text[:hit.start()].count("`") % 2 == 0:
                        report(
                            rel_name, line_no, "unwrapped_python",
                            "Inline Python missing backtick wrapping — will render as literal text",
                            "error", hit.group(0)[:120],
                        )
            # Inline Python in headings is fragile for TOC/bookmarks/PDF.
            is_heading = trimmed.startswith("#") and not trimmed.startswith("#|")
            if is_heading and "`{python}" in text:
                report(
                    rel_name, line_no, "python_in_heading",
                    "Inline Python in heading — fragile for TOC, bookmarks, and PDF",
                    "warning", trimmed[:120],
                )
    return ValidationRunResult(
        name="inline-python",
        description="Validate inline Python syntax and placement",
        files_checked=len(qmd_paths),
        issues=found,
        elapsed_ms=int((time.time() - started) * 1000),
    )
def _run_refs(self, root: Path, citations_in_code: bool, citations_in_raw: bool) -> ValidationRunResult:
    """Flag bracketed citations placed where they will not be processed.

    Args:
        root: File or directory whose ``.qmd`` files are scanned.
        citations_in_code: Check fenced blocks whose class is ``tikz``,
            ``latex`` or ``tex``.
        citations_in_raw: Check raw ``{=html}``, ``{=latex}`` and ``{=tex}``
            blocks.

    Returns:
        A result named ``refs`` with one error per offending citation,
        located on the line of the citation itself.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []
    fenced_code_pattern = re.compile(r"```\{([^}]+)\}(.*?)```", re.DOTALL)
    raw_block_pattern = re.compile(r"```\{=(html|latex|tex)\}(.*?)```", re.DOTALL | re.IGNORECASE)
    problematic_classes = {"tikz", "latex", "tex"}

    def locate(text: str, text_lines: List[str], offset: int) -> Tuple[int, str]:
        """Return the 1-based line number at *offset* and that line's trimmed text."""
        line_no = text.count("\n", 0, offset) + 1
        snippet = text_lines[line_no - 1] if line_no - 1 < len(text_lines) else ""
        return line_no, snippet.strip()[:160]

    for file in files:
        content = self._read_text(file)
        # Split once per file rather than twice per citation.
        content_lines = content.splitlines()
        rel = self._relative_file(file)
        if citations_in_code:
            for match in fenced_code_pattern.finditer(content):
                class_match = re.search(r"\.([A-Za-z][A-Za-z0-9_-]*)", match.group(1))
                cls = class_match.group(1).lower() if class_match else "unknown"
                if cls not in problematic_classes:
                    continue
                for cite_match in CITATION_BRACKET_PATTERN.finditer(match.group(2)):
                    # Citation offsets are relative to group 2 (the block body).
                    line_no, context = locate(content, content_lines, match.start(2) + cite_match.start())
                    issues.append(ValidationIssue(
                        file=rel,
                        line=line_no,
                        code="citation_in_code",
                        message=f"Citation in .{cls} code block will not be processed",
                        severity="error",
                        context=context,
                    ))
        if citations_in_raw:
            for match in raw_block_pattern.finditer(content):
                raw_type = match.group(1).lower()
                for cite_match in CITATION_BRACKET_PATTERN.finditer(match.group(2)):
                    # Anchor on the start of the block body, not the start of
                    # the fence: omitting the header length placed citations
                    # near the top of a block on the preceding line.
                    line_no, context = locate(content, content_lines, match.start(2) + cite_match.start())
                    issues.append(ValidationIssue(
                        file=rel,
                        line=line_no,
                        code="citation_in_raw",
                        message=f"Citation in raw {raw_type} block will not be processed",
                        severity="error",
                        context=context,
                    ))
    return ValidationRunResult(
        name="refs",
        description="Validate citation/reference placement in raw/code blocks",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
def _bibliography_for_qmd(self, file: Path) -> Optional[Path]:
"""Resolve the volume backmatter references.bib for a .qmd from its path."""
try:
rel = file.relative_to(self.config_manager.book_dir)
except ValueError:
return None
parts = rel.parts
if "vol1" in parts:
bib_file = self.config_manager.book_dir / "contents" / "vol1" / "backmatter" / "references.bib"
elif "vol2" in parts:
bib_file = self.config_manager.book_dir / "contents" / "vol2" / "backmatter" / "references.bib"
else:
return None
return bib_file if bib_file.exists() else None
def _run_citations(self, root: Path) -> ValidationRunResult:
    """Check that every ``@key`` citation in prose exists in the volume bibliography.

    Files for which no bibliography can be resolved are skipped. Before
    citations are collected, YAML frontmatter, ``<style>`` blocks, fenced
    code and inline code spans are removed from the text.

    Args:
        root: File or directory whose ``.qmd`` files are scanned.

    Returns:
        A result named ``citations`` with one ``missing_citation`` error per
        key that is absent from the bibliography.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []
    bib_key_pattern = re.compile(r"@\w+\{([^,\s]+)")
    # Many .qmd files share one references.bib; parse each bibliography once.
    bib_keys_by_file: Dict[Path, Set[str]] = {}
    for file in files:
        bib_file = self._bibliography_for_qmd(file)
        if bib_file is None:
            continue
        bib_keys = bib_keys_by_file.get(bib_file)
        if bib_keys is None:
            bib_keys = set(bib_key_pattern.findall(self._read_text(bib_file)))
            bib_keys_by_file[bib_file] = bib_keys
        content = self._read_text(file)
        # Strip YAML frontmatter (--- ... --- at file top) to avoid email false positives
        prose = re.sub(r"^---\n.*?\n---\n", "", content, flags=re.DOTALL)
        # Strip HTML <style> blocks to avoid CSS @media false positives
        prose = re.sub(r"<style\b[^>]*>.*?</style>", "", prose, flags=re.DOTALL)
        # Strip fenced code blocks, then inline code spans
        prose = re.sub(r"```.*?```", "", prose, flags=re.DOTALL)
        prose = re.sub(r"`[^`]+`", "", prose)
        refs = set(CITATION_REF_PATTERN.findall(prose))
        # Drop cross-reference prefixes, then trailing sentence punctuation.
        refs = {r.rstrip(".,;:") for r in refs if not r.startswith(EXCLUDED_CITATION_PREFIXES)}
        # A token made only of punctuation (e.g. "@.") strips to "", which is
        # not a citation; numeric tokens such as "@1.5" are not citations either.
        refs = {r for r in refs if r and not re.match(r"^\d+\.\d+", r)}
        for key in sorted(refs - bib_keys):
            issues.append(ValidationIssue(
                file=self._relative_file(file),
                line=self._line_for_token(content, f"@{key}"),
                code="missing_citation",
                message=f"Citation key @{key} missing in bibliography",
                severity="error",
                context=f"@{key}",
            ))
    return ValidationRunResult(
        name="citations",
        description="Validate citation keys against bibliography files",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
def _run_duplicate_labels(self, root: Path, label_types: Dict[str, List[re.Pattern[str]]]) -> ValidationRunResult:
    """Report every label that is defined more than once under *root*.

    Inside fenced code blocks only cell-option lines (``#|`` / ``%%|``) are
    inspected, so labels declared as ``#| label: fig-x`` are seen while
    ordinary code is ignored.

    Args:
        root: File or directory whose ``.qmd`` files are scanned.
        label_types: Label type name → patterns whose group 1 is the label.

    Returns:
        A result named ``duplicate-labels`` with one ``duplicate_label`` error
        for each location of a label that has more than one definition.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []
    definitions: Dict[str, List[Tuple[Path, int, str]]] = defaultdict(list)
    cell_option_prefixes = ("#|", "%%|")
    for file in files:
        in_code = False
        for idx, line in enumerate(self._read_text(file).splitlines(), 1):
            stripped = line.strip()
            if stripped.startswith("```"):
                in_code = not in_code
                continue
            # Skipping every line inside a fence made the "#| label:" and
            # "%%| label:" patterns unreachable; keep those option lines.
            if in_code and not stripped.startswith(cell_option_prefixes):
                continue
            for label_type, patterns in label_types.items():
                for pattern in patterns:
                    for match in pattern.finditer(line):
                        definitions[match.group(1)].append((file, idx, label_type))
    for label, locations in definitions.items():
        if len(locations) <= 1:
            continue
        for file, line_no, label_type in locations:
            issues.append(ValidationIssue(
                file=self._relative_file(file),
                line=line_no,
                code="duplicate_label",
                message=f"Duplicate {label_type.lower()} label: {label}",
                severity="error",
                context=label,
            ))
    return ValidationRunResult(
        name="duplicate-labels",
        description="Detect duplicate label definitions",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
def _run_unreferenced_labels(self, root: Path, label_types: Dict[str, List[re.Pattern[str]]]) -> ValidationRunResult:
    """Cross-check label definitions against ``@label`` references.

    Emits a warning for each non-section label that is never referenced and
    an error for each reference that has no matching definition. Only the
    first definition of a label is remembered.
    """
    began = time.time()
    qmd_paths = self._qmd_files(root)
    first_definition: Dict[str, Tuple[Path, int, str]] = {}
    usages: Dict[str, List[Tuple[Path, int]]] = {}
    for qmd in qmd_paths:
        for line_no, text in enumerate(self._read_text(qmd).splitlines(), 1):
            for kind, patterns in label_types.items():
                for pattern in patterns:
                    for hit in pattern.finditer(text):
                        name = hit.group(1)
                        if name not in first_definition:
                            first_definition[name] = (qmd, line_no, kind)
            for hit in LABEL_REF_PATTERN.finditer(text):
                usages.setdefault(hit.group(1), []).append((qmd, line_no))
    # unreferenced definitions (skip section defaults, consistent with legacy behavior)
    findings: List[ValidationIssue] = [
        ValidationIssue(
            file=self._relative_file(qmd),
            line=line_no,
            code="unreferenced_label",
            message=f"{kind} label {name} is never referenced",
            severity="warning",
            context=name,
        )
        for name, (qmd, line_no, kind) in first_definition.items()
        if kind != "Section" and name not in usages
    ]
    # unresolved references
    for name, places in usages.items():
        if name in first_definition:
            continue
        findings.extend(
            ValidationIssue(
                file=self._relative_file(qmd),
                line=line_no,
                code="unresolved_reference",
                message=f"Reference @{name} has no matching label definition",
                severity="error",
                context=f"@{name}",
            )
            for qmd, line_no in places
        )
    return ValidationRunResult(
        name="unreferenced-labels",
        description="Detect unreferenced labels and unresolved references",
        files_checked=len(qmd_paths),
        issues=findings,
        elapsed_ms=int((time.time() - began) * 1000),
    )
def _run_inline_refs(self, root: Path, check_patterns: bool,
                     check_scope: bool = False) -> ValidationRunResult:
    """Validate inline ``{python}`` references and flag fragile rendering patterns.

    For each file, names assigned (or classes defined) inside Python cells
    are collected, and every inline reference must resolve against them; a
    dotted reference resolves through the part before the first dot.

    Args:
        root: File or directory whose ``.qmd`` files are scanned.
        check_patterns: Also run the per-line pattern hazard checks.
        check_scope: Also run the external scope analysis. It is best-effort:
            a failure is reported as a ``scope_check_failed`` warning instead
            of being discarded.

    Returns:
        A result named ``inline-refs``.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []
    yaml_option_inline = re.compile(r"^#\|\s*(fig-cap|tbl-cap|lst-cap|fig-alt):\s*.*`\{python\}")
    caption_syntax_inline = re.compile(r"^:\s+.*`\{python\}.*\{#(tbl|fig|lst)-")
    inline_fstring = re.compile(r"`\{python\}\s*f\"[^`]+`")
    inline_func_call = re.compile(r"`\{python\}\s*\w+\([^`]+\)`")
    # Stateless per-line hazards, split around the stateful grid-table check
    # so that issues keep their established order.
    hazards_before_grid = (
        (LATEX_INLINE_PATTERN, "latex_math_inline_python", "Inline Python inside LaTeX math can strip decimals", "warning"),
        (LATEX_ADJACENT_PATTERN, "latex_adjacent_inline_python", "Inline Python adjacent to LaTeX operator is fragile", "warning"),
    )
    hazards_after_grid = (
        (inline_fstring, "inline_fstring", "Inline f-string should be precomputed in Python cell", "warning"),
        (inline_func_call, "inline_function_call", "Inline function call should be precomputed in Python cell", "warning"),
        (yaml_option_inline, "yaml_option_inline_python", "Inline Python in YAML fig/tbl/lst metadata will not render", "error"),
        (caption_syntax_inline, "caption_inline_python", "Inline Python in caption syntax will not render", "error"),
    )

    def add(rel: str, line_no: int, code: str, message: str, severity: str, context: str = "") -> None:
        issues.append(ValidationIssue(
            file=rel,
            line=line_no,
            code=code,
            message=message,
            severity=severity,
            context=context,
        ))

    for file in files:
        rel = self._relative_file(file)
        lines = self._read_text(file).splitlines()
        refs: List[Tuple[int, str]] = []
        compute_vars: Set[str] = set()
        compute_classes: Set[str] = set()
        in_cell = False
        for idx, line in enumerate(lines, 1):
            stripped = line.strip()
            if CELL_START_PATTERN.match(stripped):
                in_cell = True
                continue
            if in_cell and CELL_END_PATTERN.match(stripped):
                in_cell = False
                continue
            if in_cell:
                cls_match = CLASS_DEF_PATTERN.match(stripped)
                if cls_match:
                    compute_classes.add(cls_match.group(1))
                assign = ASSIGN_PATTERN.match(stripped)
                if assign:
                    compute_vars.add(assign.group(1))
                tuple_assign = TUPLE_ASSIGN_PATTERN.match(stripped)
                if tuple_assign:
                    for name in re.split(r'\s*,\s*', tuple_assign.group(1)):
                        compute_vars.add(name.strip())
            for match in INLINE_REF_PATTERN.finditer(line):
                refs.append((idx, match.group(1)))
        for line_no, ref in refs:
            if "." in ref:
                # Owner.attr: the owner may be a class or an assigned variable.
                owner = ref.split(".", 1)[0]
                resolved = owner in compute_classes or owner in compute_vars
            else:
                resolved = ref in compute_vars
            if not resolved:
                add(
                    rel, line_no, "undefined_inline_ref",
                    f"Inline reference `{ref}` is not defined in python cells",
                    "error", f"`{{python}} {ref}`",
                )
        if check_patterns:
            in_grid = False
            for idx, line in enumerate(lines, 1):
                stripped = line.strip()
                for pattern, code, message, severity in hazards_before_grid:
                    if pattern.search(line):
                        add(rel, idx, code, message, severity, stripped[:160])
                # A separator row opens a grid table; any non-empty line that
                # does not start with "|" closes it.
                if GRID_TABLE_SEP_PATTERN.match(stripped):
                    in_grid = True
                elif in_grid and stripped and not stripped.startswith("|"):
                    in_grid = False
                if in_grid and "`{python}" in line:
                    add(
                        rel, idx, "grid_table_inline_python",
                        "Inline Python in grid tables is unsupported",
                        "error", stripped[:160],
                    )
                for pattern, code, message, severity in hazards_after_grid:
                    if pattern.search(line):
                        add(rel, idx, code, message, severity, stripped[:160])
        if check_scope:
            # Imported lazily: only needed for this optional analysis.
            from book.quarto.mlsys.validate_inline_refs import check_scope as _check_scope
            try:
                for _path, lineno, check_type, msg in _check_scope(file, verbose=False):
                    add(rel, lineno, check_type.lower(), msg, "warning")
            except Exception as exc:
                # Best-effort analysis: keep going, but surface the failure
                # instead of silently dropping it.
                add(rel, 1, "scope_check_failed", f"Scope analysis failed: {exc}", "warning")
    return ValidationRunResult(
        name="inline-refs",
        description="Validate inline Python refs and rendering hazard patterns",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
# ------------------------------------------------------------------
# Headers (ported from manage_section_ids.py --verify)
# ------------------------------------------------------------------
def _run_headers(self, root: Path) -> ValidationRunResult:
    """Report headers that carry no ``{#sec-...}`` identifier.

    Lines inside fenced code blocks and inside ``::: {.class}`` divs are
    ignored, as are headers whose attribute block contains ``.unnumbered``.
    Every remaining header without a ``{#sec-...}`` attribute yields one
    ``missing_section_id`` error.
    """
    started = time.time()
    qmd_files = self._qmd_files(root)
    found: List[ValidationIssue] = []

    heading_re = re.compile(r"^(#{1,6})\s+(.+?)(?:\s*\{[^}]*\})?$")
    div_open_re = re.compile(r"^:::\s*\{\.")
    div_close_re = re.compile(r"^:::\s*$")
    fence_re = re.compile(r"^```[^`]*$")
    section_id_re = re.compile(r"\{#sec-[^}]+\}")

    for qmd in qmd_files:
        inside_code = False
        inside_div = False
        for line_no, raw in enumerate(self._read_text(qmd).splitlines(), 1):
            text = raw.strip()

            if fence_re.match(text):
                inside_code = not inside_code
                continue
            if inside_code:
                continue

            # Div state is a simple flag: any bare ":::" ends it.
            if div_open_re.match(text):
                inside_div = True
                continue
            if div_close_re.match(text):
                inside_div = False
                continue
            if inside_div:
                continue

            heading = heading_re.match(raw)
            if heading is None:
                continue

            # Attribute block = first "{" through last "}" on the line.
            open_at = raw.find("{")
            close_at = raw.rfind("}")
            attrs = raw[open_at : close_at + 1] if 0 <= open_at < close_at else ""
            if ".unnumbered" in attrs or section_id_re.search(raw):
                continue

            found.append(
                ValidationIssue(
                    file=self._relative_file(qmd),
                    line=line_no,
                    code="missing_section_id",
                    message=f"Header missing section ID: {heading.group(2).strip()}",
                    severity="error",
                    context=text[:160],
                )
            )

    return ValidationRunResult(
        name="headers",
        description="Verify section headers have {#sec-...} IDs",
        files_checked=len(qmd_files),
        issues=found,
        elapsed_ms=int((time.time() - started) * 1000),
    )
# ------------------------------------------------------------------
# Footnote Placement (ported from check_forbidden_footnotes.py)
# ------------------------------------------------------------------
def _run_footnote_placement(self, root: Path) -> ValidationRunResult:
    """Report footnotes that appear in forbidden locations.

    Each file returned by ``self._qmd_files(root)`` is scanned line by line.
    Inline footnotes (``^[...]``) are always reported.  ``[^fn-...]``
    references are reported when they occur in a pipe-table row, on a
    ``fig-cap:``/``tbl-cap:`` line, on a ``: **Label**:`` markdown caption
    line, inside a callout ``title="..."`` attribute, or on any line inside
    a ``:::`` div other than the line that opened the outermost div.  A
    single line can therefore produce several issues.

    Args:
        root: Location passed to ``self._qmd_files`` to select files.

    Returns:
        A ``ValidationRunResult`` named ``footnote-placement``; every issue
        has severity ``error``.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    fn_pat = re.compile(r"\[\^fn-[\w-]+\]")
    inline_fn_pat = re.compile(r"\^\[[^\]]+\]")
    table_sep_pat = re.compile(r"^\|[\s\-:+]+\|")
    # Per-line patterns are compiled once here instead of being looked up
    # through re.match()/re.search() on every line of every file.
    div_open_attr_pat = re.compile(r"^:{3,4}\s*\{")
    div_open_word_pat = re.compile(r"^:{3,4}\s+\w")
    div_close_pat = re.compile(r"^:{3,4}\s*$")
    yaml_cap_pat = re.compile(r"^\s*(fig-cap|tbl-cap):")
    md_cap_pat = re.compile(r"^:\s*\*\*[^*]+\*\*:")
    callout_title_pat = re.compile(r"^:{3,4}\s*\{.*title=")
    title_value_pat = re.compile(r'title="([^"]*)"')

    def report(file: Path, line_no: int, code: str, message: str, context: str) -> None:
        """Append one error-severity issue for ``file``."""
        issues.append(
            ValidationIssue(
                file=self._relative_file(file),
                line=line_no,
                code=code,
                message=message,
                severity="error",
                context=context,
            )
        )

    for file in files:
        lines = self._read_text(file).splitlines()
        div_depth = 0
        # Line that opened the outermost div; 0 while outside any div.
        div_start_line = 0

        for idx, line in enumerate(lines, 1):
            stripped = line.strip()

            # Track div nesting
            if div_open_attr_pat.match(stripped) or div_open_word_pat.match(stripped):
                div_depth += 1
                if div_depth == 1:
                    div_start_line = idx
            elif div_close_pat.match(stripped) and div_depth > 0:
                div_depth -= 1
                if div_depth == 0:
                    div_start_line = 0

            # Inline footnotes are always forbidden
            for m in inline_fn_pat.finditer(line):
                report(
                    file,
                    idx,
                    "inline_footnote",
                    "Inline footnote syntax; use [^fn-name] reference format",
                    m.group(0)[:80],
                )

            footnotes = fn_pat.findall(line)
            if not footnotes:
                continue
            snippet = stripped[:80]

            # Table cell (separator rows are not cells)
            if (
                stripped.startswith("|")
                and stripped.count("|") >= 2
                and not table_sep_pat.match(stripped)
            ):
                for fn in footnotes:
                    report(file, idx, "footnote_in_table", f"Footnote {fn} in table cell", snippet)

            # YAML caption
            if yaml_cap_pat.match(line):
                cap_type = "figure" if "fig-cap" in line else "table"
                for fn in footnotes:
                    report(
                        file,
                        idx,
                        f"footnote_in_{cap_type}_caption",
                        f"Footnote {fn} in {cap_type} caption",
                        snippet,
                    )

            # Markdown caption
            if md_cap_pat.match(line):
                for fn in footnotes:
                    report(
                        file,
                        idx,
                        "footnote_in_markdown_caption",
                        f"Footnote {fn} in markdown caption",
                        snippet,
                    )

            # Callout title: only footnotes inside the title="..." value count
            if callout_title_pat.match(stripped):
                title_match = title_value_pat.search(line)
                if title_match:
                    for fn in fn_pat.findall(title_match.group(1)):
                        report(
                            file,
                            idx,
                            "footnote_in_callout_title",
                            f"Footnote {fn} in callout title (breaks LaTeX)",
                            snippet,
                        )

            # Inside a div (the outermost opening line itself is exempt)
            if div_depth > 0 and div_start_line != idx:
                for fn in footnotes:
                    report(
                        file,
                        idx,
                        "footnote_in_div",
                        f"Footnote {fn} inside div block (started line {div_start_line})",
                        snippet,
                    )

    return ValidationRunResult(
        name="footnote-placement",
        description="Check footnotes in forbidden locations",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
# ------------------------------------------------------------------
# Footnote Refs (ported from footnote_cleanup.py --validate)
# ------------------------------------------------------------------
def _run_footnote_refs(self, root: Path) -> ValidationRunResult:
    """Cross-check footnote references against footnote definitions.

    For every file returned by ``self._qmd_files(root)`` four checks run, in
    this order:

    * ``undefined_footnote_ref`` (error): ``[^id]`` is used but no
      ``[^id]: text`` definition exists; reported at the first use.
    * ``unused_footnote_def`` (warning): a definition is never referenced.
    * ``duplicate_footnote_def`` (error): more than one line starts with
      ``[^id]:``.
    * ``footnote_missing_blank_line`` (error): a definition line directly
      follows a non-blank line.

    Args:
        root: Location passed to ``self._qmd_files`` to select files.

    Returns:
        A ``ValidationRunResult`` named ``footnote-refs``.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    ref_pat = re.compile(r"\[\^([^]]+)\]")
    def_pat = re.compile(r"^\[\^([^]]+)\]:\s*(.+)$", re.MULTILINE)
    # Start of a definition line; shared by the duplicate and blank-line checks.
    def_start_pat = re.compile(r"^\[\^([^\]]+)\]:")

    for file in files:
        content = self._read_text(file)
        lines = content.split("\n")

        # Collect definitions
        fn_defs: Dict[str, str] = {
            m.group(1): m.group(2) for m in def_pat.finditer(content)
        }

        # Collect references; a definition line does not count as a
        # reference to the footnote it defines.
        fn_refs: Dict[str, List[int]] = defaultdict(list)
        for line_num, line in enumerate(lines, 1):
            dm = def_pat.match(line)
            defined_here = dm.group(1) if dm else None
            for m in ref_pat.finditer(line):
                fn_id = m.group(1)
                if fn_id == defined_here:
                    continue
                fn_refs[fn_id].append(line_num)

        # Undefined references
        for fn_id in sorted(set(fn_refs) - set(fn_defs)):
            issues.append(
                ValidationIssue(
                    file=self._relative_file(file),
                    line=fn_refs[fn_id][0],
                    code="undefined_footnote_ref",
                    message=f"Undefined footnote reference: [^{fn_id}]",
                    severity="error",
                    context=f"[^{fn_id}]",
                )
            )

        # Unused definitions
        for fn_id in sorted(set(fn_defs) - set(fn_refs)):
            issues.append(
                ValidationIssue(
                    file=self._relative_file(file),
                    line=self._line_for_token(content, f"[^{fn_id}]:"),
                    code="unused_footnote_def",
                    message=f"Unused footnote definition: [^{fn_id}]",
                    severity="warning",
                    context=f"[^{fn_id}]:",
                )
            )

        # Duplicate definitions
        def_counts: Dict[str, int] = defaultdict(int)
        for line in lines:
            dm = def_start_pat.match(line)
            if dm:
                def_counts[dm.group(1)] += 1
        for fn_id, count in def_counts.items():
            if count > 1:
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=self._line_for_token(content, f"[^{fn_id}]:"),
                        code="duplicate_footnote_def",
                        message=f"Duplicate footnote definition ({count}x): [^{fn_id}]",
                        severity="error",
                        context=f"[^{fn_id}]:",
                    )
                )

        # Missing blank line before footnote definition.
        # Pandoc requires footnote definitions to start a new block.
        # Without a preceding blank line, Pandoc treats the definition
        # as continuation text and renders [^fn-name] as literal text.
        # Pairs start at (line 1, line 2), so line_no is the 1-based number
        # of the second element of each pair.
        for line_no, (prev, line) in enumerate(zip(lines, lines[1:]), 2):
            dm = def_start_pat.match(line)
            if dm is None or not prev.strip():
                continue
            issues.append(
                ValidationIssue(
                    file=self._relative_file(file),
                    line=line_no,
                    code="footnote_missing_blank_line",
                    message=(
                        f"Footnote definition [^{dm.group(1)}] has no blank line before it — "
                        "Pandoc will not parse it as a footnote"
                    ),
                    severity="error",
                    context=f"prev: {prev.strip()[:60]}",
                )
            )

    return ValidationRunResult(
        name="footnote-refs",
        description="Validate footnote references and definitions",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
# ------------------------------------------------------------------
# Figures (ported from check_figure_completeness.py)
# ------------------------------------------------------------------
def _run_figures(self, root: Path) -> ValidationRunResult:
    """Check that every figure has both a caption and alt-text.

    Two passes run over each file returned by ``self._qmd_files(root)``:

    1. Attribute-based figures: any line containing ``{#fig-...``.  The
       caption may come from ``fig-cap="..."`` or from non-empty markdown
       image text (``![text](``); alt-text must come from ``fig-alt="..."``
       on the same line.
    2. Code-cell figures: ``python``/``r``/``julia``/``ojs`` cells whose
       ``#| label:`` starts with ``fig-`` and was not already seen in
       pass 1; caption and alt-text come from the ``#| fig-cap:`` and
       ``#| fig-alt:`` cell options.

    Args:
        root: Location passed to ``self._qmd_files`` to select files.

    Returns:
        A ``ValidationRunResult`` named ``figures`` with one
        ``incomplete_figure`` error per figure missing either part.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    fig_id_pat = re.compile(r"\{#(fig-[a-zA-Z0-9_-]+)[\s}]")
    md_cap_pat = re.compile(r"!\[(.+?)\]\(")
    # Compiled once instead of being resolved through re.search()/re.match()
    # for every line.
    attr_cap_pat = re.compile(r'fig-cap="[^"]+')
    attr_alt_pat = re.compile(r'fig-alt="[^"]+')
    cell_open_pat = re.compile(r"^```\{(?:python|r|julia|ojs)")
    cell_opt_pat = re.compile(r"^#\|\s*([\w-]+):\s*(.+)$")

    def missing_parts(has_cap: bool, has_alt: bool) -> List[str]:
        """Return the human-readable names of the absent parts, in order."""
        parts = []
        if not has_cap:
            parts.append("caption")
        if not has_alt:
            parts.append("alt-text")
        return parts

    for file in files:
        lines = self._read_text(file).splitlines()
        seen_ids: Set[str] = set()

        # Pass 1: attribute-based figures
        for idx, line in enumerate(lines, 1):
            m = fig_id_pat.search(line)
            if not m:
                continue
            fig_id = m.group(1)
            seen_ids.add(fig_id)

            has_cap = bool(attr_cap_pat.search(line))
            if not has_cap:
                # Fall back to the markdown image text as the caption.
                md_m = md_cap_pat.search(line)
                has_cap = bool(md_m and md_m.group(1).strip())
            has_alt = bool(attr_alt_pat.search(line))

            missing = missing_parts(has_cap, has_alt)
            if missing:
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="incomplete_figure",
                        message=f"Figure {fig_id} missing: {', '.join(missing)}",
                        severity="error",
                        context=line.strip()[:120],
                    )
                )

        # Pass 2: code-cell figures
        in_code = False
        code_start = 0
        cell_opts: Dict[str, str] = {}
        for idx, line in enumerate(lines, 1):
            stripped = line.rstrip()

            if not in_code:
                if cell_open_pat.match(stripped):
                    in_code = True
                    code_start = idx
                    cell_opts = {}
                continue

            if stripped == "```":
                # End of cell: evaluate the options gathered for it.
                label = cell_opts.get("label", "")
                if label.startswith("fig-") and label not in seen_ids:
                    missing = missing_parts(
                        bool(cell_opts.get("fig-cap", "")),
                        bool(cell_opts.get("fig-alt", "")),
                    )
                    if missing:
                        issues.append(
                            ValidationIssue(
                                file=self._relative_file(file),
                                line=code_start,
                                code="incomplete_figure",
                                message=f"Figure {label} missing: {', '.join(missing)}",
                                severity="error",
                                context=f"code-cell figure {label}",
                            )
                        )
                    seen_ids.add(label)
                in_code = False
                cell_opts = {}
                continue

            opt_m = cell_opt_pat.match(stripped)
            if opt_m:
                # Surrounding quotes are not part of the option value.
                cell_opts[opt_m.group(1)] = opt_m.group(2).strip().strip("\"'")

    return ValidationRunResult(
        name="figures",
        description="Check figures have captions and alt-text",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
# ------------------------------------------------------------------
# Float Flow (ported from figure_table_flow_audit.py)
# ------------------------------------------------------------------
def _run_float_flow(self, root: Path) -> ValidationRunResult:
# Audit where each figure/table ("float") is defined relative to the prose
# that references it. Emits three warning codes: orphan_float (defined but
# never referenced), late_float and early_float (definition too far from
# the first reference). Labels that are referenced but not defined in the
# same file are skipped.
start = time.time()
files = self._qmd_files(root)
issues: List[ValidationIssue] = []
# Three ways a fig-/tbl- label can be defined through attributes:
#   div_def_pat  - a "::: {... #fig-x ...}" / "#tbl-x" div opener
#   img_def_pat  - a markdown image followed by an attribute block
#   tbl_cap_pat  - a ": caption {#tbl-x}" style caption line
# ref_pat matches "@fig-x" / "@tbl-x" cross-references.
div_def_pat = re.compile(r":::\s*\{[^}]*#((?:fig|tbl)-[\w-]+)")
img_def_pat = re.compile(r"!\[.*?\]\(.*?\)\s*\{[^}]*#((?:fig|tbl)-[\w-]+)")
tbl_cap_pat = re.compile(r"^:\s+.*\{[^}]*#((?:fig|tbl)-[\w-]+)")
ref_pat = re.compile(r"@((?:fig|tbl)-[\w-]+)")
for file in files:
lines = self._read_text(file).splitlines()
# defs: label -> line of its first definition; refs: label -> lines using it.
defs: Dict[str, int] = {}
refs: Dict[str, List[int]] = defaultdict(list)
in_code = False
in_float = False
float_label: Optional[str] = None
# (start, end) line pairs of executable cells, used later to discount
# code lines from the definition/reference distance.
code_spans: List[Tuple[int, int]] = []
code_start = 0
cell_opts: Dict[str, str] = {}
for idx, line in enumerate(lines, 1):
stripped = line.rstrip()
# Code block tracking
# Only fences of the form "```{...}" open a tracked cell; a bare "```"
# closes it.
if not in_code and re.match(r"^```\{", stripped):
in_code = True
code_start = idx
cell_opts = {}
continue
if in_code and stripped == "```":
code_spans.append((code_start, idx))
# A cell whose "#| label:" starts with fig-/tbl- defines that float at
# the cell's opening line, unless the label was already defined.
label = cell_opts.get("label", "")
if label.startswith(("fig-", "tbl-")) and label not in defs:
defs[label] = code_start
in_code = False
cell_opts = {}
continue
if in_code:
# "#| key: value" cell options are collected with surrounding quotes removed.
opt_m = re.match(r"^#\|\s*([\w-]+):\s*(.+)$", stripped)
if opt_m:
cell_opts[opt_m.group(1)] = opt_m.group(2).strip().strip("\"'")
continue
# Attribute-based definitions
for pat in [div_def_pat, img_def_pat, tbl_cap_pat]:
m = pat.search(line)
if m:
label = m.group(1)
if label not in defs:
defs[label] = idx
# A div-style definition also opens a "float block"; references to the
# block's own label are ignored while it is open (see the References step).
if pat == div_def_pat:
in_float = True
float_label = label
# Track float block end
# Any line starting with ":::" that is not "::: {" closes the float block.
if in_float:
ls = line.strip()
if ls.startswith(":::") and not ls.startswith("::: {"):
in_float = False
float_label = None
# References
# Lines carrying fig-cap=/fig-alt= attributes are not scanned for references.
if "fig-cap=" in line or "fig-alt=" in line:
continue
for m in ref_pat.finditer(line):
label = m.group(1)
if in_float and label == float_label:
continue
refs[label].append(idx)
# Evaluate status
all_labels = set(defs.keys()) | set(refs.keys())
for label in sorted(all_labels):
def_line = defs.get(label)
ref_lines = refs.get(label, [])
first_ref = min(ref_lines) if ref_lines else None
if not def_line:
continue # XREF — informational, skip
if not first_ref:
issues.append(
ValidationIssue(
file=self._relative_file(file),
line=def_line,
code="orphan_float",
message=f"{'Figure' if label.startswith('fig-') else 'Table'} {label} defined but never referenced",
severity="warning",
context=label,
)
)
continue
# Compute prose gap
# gap > 0 means the definition comes after the first reference. Lines of
# the gap that overlap recorded code spans are subtracted, so only the
# remaining lines count towards the 30-line threshold.
gap = def_line - first_ref
code_lines = 0
if gap > 0:
for cs, ce in code_spans:
os_ = max(first_ref, cs)
oe_ = min(def_line, ce)
if os_ <= oe_:
code_lines += oe_ - os_ + 1
prose_gap = gap - code_lines
if prose_gap > 30:
# Check closest reference
# The float is accepted when its nearest reference lies from 5 lines
# after to 30 lines before the definition.
closest = min(ref_lines, key=lambda r: abs(def_line - r))
closest_gap = def_line - closest
if -5 <= closest_gap <= 30:
continue # OK
issues.append(
ValidationIssue(
file=self._relative_file(file),
line=def_line,
code="late_float",
message=f"{label} defined at L{def_line}, first referenced at L{first_ref} (too far after mention)",
severity="warning",
context=label,
)
)
# Definition precedes the first reference by more than 5 lines.
elif prose_gap < -5:
issues.append(
ValidationIssue(
file=self._relative_file(file),
line=def_line,
code="early_float",
message=f"{label} defined at L{def_line}, first referenced at L{first_ref} (appears before mention)",
severity="warning",
context=label,
)
)
return ValidationRunResult(
name="float-flow",
description="Audit figure/table placement relative to first reference",
files_checked=len(files),
issues=issues,
elapsed_ms=int((time.time() - start) * 1000),
)
# ------------------------------------------------------------------
# Indexes (ported from check_index_placement.py)
# ------------------------------------------------------------------
def _run_indexes(self, root: Path) -> ValidationRunResult:
    """Flag LaTeX index entries placed next to structural markup.

    Outside fenced code blocks, each line is tested against four rules:
    an index entry on a heading line, directly before ``:::``, on the same
    line as a div/callout opener, or before a footnote definition.  The
    div/callout rule is not applied to lines containing ``fig-cap=``.
    Every hit is reported as an error.
    """
    t0 = time.time()
    sources = self._qmd_files(root)
    problems: List[ValidationIssue] = []

    rules = (
        ("index_on_heading", re.compile(r"^#{1,6}\s+.*\\index\{"), "\\index{} on same line as heading"),
        ("index_before_div", re.compile(r"\\index\{[^}]*\}:::"), "\\index{} directly before ::: (div/callout)"),
        ("index_after_div", re.compile(r"^::+\s+\{[^}]*\}\s*\\index\{"), "\\index{} on same line as div/callout"),
        ("index_before_footnote", re.compile(r"^\\index\{[^}]*\}.*\[\^[^\]]+\]:"), "\\index{} before footnote definition"),
    )

    for source in sources:
        fenced = False
        for number, text in enumerate(self._read_text(source).splitlines(), 1):
            if text.strip().startswith("```"):
                fenced = not fenced
                continue
            if fenced:
                continue

            has_fig_cap = "fig-cap=" in text
            for rule_code, rule_re, rule_msg in rules:
                # fig-cap lines are exempt from the div/callout rule only.
                if has_fig_cap and rule_code == "index_after_div":
                    continue
                if rule_re.search(text) is None:
                    continue
                problems.append(
                    ValidationIssue(
                        file=self._relative_file(source),
                        line=number,
                        code=rule_code,
                        message=rule_msg,
                        severity="error",
                        context=text.strip()[:120],
                    )
                )

    return ValidationRunResult(
        name="indexes",
        description="Check LaTeX \\index{} placement",
        files_checked=len(sources),
        issues=problems,
        elapsed_ms=int((time.time() - t0) * 1000),
    )
# ------------------------------------------------------------------
# Rendering (ported from check_render_patterns.py)
# ------------------------------------------------------------------
def _run_rendering(self, root: Path) -> ValidationRunResult:
    """Scan prose for patterns known to render badly.

    Outside fenced code blocks, every line of each file returned by
    ``self._qmd_files(root)`` is checked, in this order, for:

    * inline Python inside a grid table (``grid_table_python``, error);
    * inline Python inside ``$...$`` math, except when it sits inside a
      ``^{...}`` superscript (``python_in_dollar_math``, error);
    * a lowercase ``x`` used as a multiplication sign before a lowercase
      word (``lowercase_x_multiplication``, warning);
    * the table of single-regex checks in ``regex_checks``.

    Args:
        root: Location passed to ``self._qmd_files`` to select files.

    Returns:
        A ``ValidationRunResult`` named ``rendering``.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    regex_checks = [
        ("missing_opening_backtick", re.compile(r"(?<!`)(\{python\}\s+\w+`)"), "Missing opening backtick on inline Python", "error"),
        ("dollar_before_python", re.compile(r"\$\{python\}\s+\w+`"), "Dollar sign instead of backtick before {python}", "error"),
        ("quad_asterisks", re.compile(r"\*{4,}"), "Quad asterisks — likely malformed bold/italic", "warning"),
        ("footnote_in_table", re.compile(r"^\|.*\[\^fn-[^\]]+\].*\|"), "Footnote in table cell — may break PDF", "warning"),
        ("double_dollar_python", re.compile(r"\$\$[^$]*`\{python\}"), "Inline Python in display math", "error"),
        # Currency: unescaped $ before number can be parsed as math. Use \$ for currency (see book-prose.md).
        # Match: $1,000 (comma), $4.00 (decimal), $50 million/billion/etc.
        # Exclude: $1.5 \times (math), $0.5$ (inline math), $4.6 / (division).
        ("unescaped_currency", re.compile(
            r"(?<!\\)\$[0-9]{1,3}(?:,[0-9]{3})+(?=\s(?!\s*\\times)|,[0-9]|\)|$)"  # $1,000, exclude $25,000 \times
            r"|(?<!\\)\$[0-9]+\.[0-9]+(?=\s(?!\s*\\times)(?!\s*/)(?!\s*-)(?!\s*\+)(?!\s*\\ll)|,[0-9]|\)|$|/)(?!\\$)"  # $4.00, exclude math
            r"|(?<!\\)\$[0-9]+(?=\s+(?:million|billion|thousand|M|B|K|per|each|/))"  # $50 million
        ), "Unescaped dollar before number — use \\$ for currency", "warning"),
    ]
    grid_sep_pat = re.compile(r"^\+[-:=+]+\+$")
    math_span_pat = re.compile(r"(?<!\\)\$(?!\$)(?!`)(.+?)(?<!\\)\$")
    # Inline Python wrapped in a ^{...} superscript is tolerated inside math;
    # compiled once rather than rebuilt for every math span.
    superscript_python_pat = re.compile(r"\^\{[^}]*`\{python\}[^`]*`[^}]*\}")
    # Lowercase 'x' used as multiplication in prose (should be $\times$).
    # Matches: `...`x word, NUMx word.  Both alternatives require whitespace
    # right after the x, so hex literals such as 0x61 or 0xff can never
    # match and need no separate exclusion.  The lowercase letter after the
    # whitespace also excludes hardware counts like "8x A100".
    lowercase_x_mult_pat = re.compile(
        r"`x\s+[a-z]"     # `...`x word (after inline python)
        r"|"
        r"\dx\s+[a-z]"    # Nx word (digit then x then lowercase)
    )
    # fig-alt lines to skip
    fig_alt_pat = re.compile(r'fig-alt\s*=\s*"')

    for file in files:
        lines = self._read_text(file).splitlines()
        in_grid = False
        in_code = False
        for idx, line in enumerate(lines, 1):
            stripped = line.strip()

            # Code block tracking
            if stripped.startswith("```"):
                in_code = not in_code
                continue
            if in_code:
                continue

            # Grid table tracking: a separator row enters the table; the
            # first non-blank line that is not a "|" row leaves it.
            if grid_sep_pat.match(stripped):
                in_grid = True
            elif in_grid and stripped and not stripped.startswith("|"):
                in_grid = False

            if in_grid and "`{python}" in line:
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="grid_table_python",
                        message="Grid table with inline Python — convert to pipe table",
                        severity="error",
                        context=stripped[:120],
                    )
                )

            # Python inside $...$ math
            for m in math_span_pat.finditer(line):
                inner = m.group(1)
                if "{python}" not in inner:
                    continue
                if "{python}" in superscript_python_pat.sub("", inner):
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="python_in_dollar_math",
                            message="Inline Python inside $...$ math block",
                            severity="error",
                            context=m.group(0)[:120],
                        )
                    )

            # Lowercase 'x' used as multiplication in prose
            # Skip fig-alt lines and index entries
            if not fig_alt_pat.search(line) and not stripped.startswith("\\index"):
                for rm in lowercase_x_mult_pat.finditer(line):
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="lowercase_x_multiplication",
                            message="Lowercase 'x' used as multiplication — use $\\times$ instead",
                            severity="warning",
                            context=rm.group(0)[:120],
                        )
                    )

            # Standard regex checks
            for code, pattern, message, severity in regex_checks:
                for rm in pattern.finditer(line):
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code=code,
                            message=message,
                            severity=severity,
                            context=rm.group(0)[:120],
                        )
                    )

    return ValidationRunResult(
        name="rendering",
        description="Check for problematic rendering patterns",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
def _run_python_echo(self, root: Path) -> ValidationRunResult:
    """Require ``#| echo: false`` at the top of every ```{python} block.

    Only the leading run of option lines (``#| ...``) and blank lines of a
    block is inspected, plus the first line that ends that run.  A block
    without the option yields one ``python_missing_echo_false`` error at
    the block's opening line.
    """
    began = time.time()
    qmd_files = self._qmd_files(root)
    problems: List[ValidationIssue] = []

    opener = re.compile(r"^```\{python\}")
    closer = re.compile(r"^```\s*$")
    # Quarto chunk option: #| echo: false (with optional whitespace)
    echo_off = re.compile(r"#\|\s*echo\s*:\s*false", re.IGNORECASE)

    for qmd in qmd_files:
        body = self._read_text(qmd).splitlines()
        total = len(body)
        pos = 0
        while pos < total:
            if opener.match(body[pos]) is None:
                pos += 1
                continue

            opened_at = pos + 1  # 1-based line number of the opening fence
            cursor = pos + 1
            echo_disabled = False
            while cursor < total:
                candidate = body[cursor]
                if closer.match(candidate):
                    break
                text = candidate.strip()
                if echo_off.search(text):
                    echo_disabled = True
                    break
                # First non-blank, non-option line: the option header is over.
                if text and not text.startswith("#|"):
                    break
                cursor += 1

            if not echo_disabled:
                problems.append(
                    ValidationIssue(
                        file=self._relative_file(qmd),
                        line=opened_at,
                        code="python_missing_echo_false",
                        message="Python block must include #| echo: false — code must not appear in rendered output",
                        severity="error",
                        context="Add #| echo: false as first line after ```{python}",
                    )
                )

            # Resume scanning on the line after this block's closing fence.
            while cursor < total and closer.match(body[cursor]) is None:
                cursor += 1
            pos = cursor + 1

    return ValidationRunResult(
        name="python-echo",
        description="Check Python blocks have echo: false",
        files_checked=len(qmd_files),
        issues=problems,
        elapsed_ms=int((time.time() - began) * 1000),
    )
# ------------------------------------------------------------------
# Dropcaps (ported from validate_dropcap_compat.py)
# ------------------------------------------------------------------
def _run_dropcaps(self, root: Path) -> ValidationRunResult:
    """Check the line that would receive a drop cap.

    In each file the scan looks for a chapter header (``# ... {#sec-``),
    then the first numbered ``##`` header after it, then the first line
    that is not blank, an HTML comment, raw LaTeX, a list item or a header.
    That line is reported as an error when it starts with a
    cross-reference, a markdown link or inline code.  Front matter, fenced
    code and ``:::`` divs are skipped, and scanning of the file stops at
    that first paragraph line.
    """
    began = time.time()
    qmd_files = self._qmd_files(root)
    problems: List[ValidationIssue] = []

    chapter_hdr = re.compile(r"^#\s+[^#].*\{#sec-")
    numbered_h2 = re.compile(r"^##\s+[^#]")
    unnumbered_h2 = re.compile(r"^##\s+.*\{.*\.unnumbered.*\}")
    yaml_fence = re.compile(r"^---\s*$")
    code_fence = re.compile(r"^```")
    div_fence = re.compile(r"^:::")

    # Lines that never count as the first paragraph line.
    skippable = (
        re.compile(r"^\s*$"),                    # blank
        re.compile(r"^\s*<!--"),                 # HTML comment
        re.compile(r"^\s*\\"),                   # raw LaTeX
        re.compile(r"^\s*[-*+]|\s*\d+\."),       # list item
    )
    # (pattern, issue code, message), tested in order; first match wins.
    opening_rules = (
        (re.compile(r"^\s*@(sec|fig|tbl|lst|eq)-"), "dropcap_crossref", "Drop cap paragraph starts with cross-reference"),
        (re.compile(r"^\s*\["), "dropcap_link", "Drop cap paragraph starts with markdown link"),
        (re.compile(r"^\s*`"), "dropcap_inline", "Drop cap paragraph starts with inline code"),
    )

    for qmd in qmd_files:
        front_matter = False
        fenced = False
        div_level = 0
        seen_chapter = False
        seen_h2 = False

        for line_no, raw in enumerate(self._read_text(qmd).splitlines(), 1):
            if line_no == 1 and yaml_fence.match(raw):
                front_matter = True
                continue
            if front_matter:
                front_matter = yaml_fence.match(raw) is None
                continue

            if code_fence.match(raw):
                fenced = not fenced
                continue
            if fenced:
                continue

            if div_fence.match(raw):
                # A bare ":::" closes a div; any other ":::" line opens one.
                if raw.strip() == ":::":
                    div_level = max(0, div_level - 1)
                else:
                    div_level += 1
                continue
            if div_level > 0:
                continue

            if chapter_hdr.match(raw):
                seen_chapter = True
                seen_h2 = False
                continue
            if not seen_chapter:
                continue

            if numbered_h2.match(raw) and not unnumbered_h2.match(raw):
                seen_h2 = True
                continue
            if not seen_h2:
                continue

            if any(pattern.match(raw) for pattern in skippable):
                continue
            if raw.strip().startswith("#"):
                continue

            # First paragraph line
            for pattern, code, message in opening_rules:
                if pattern.match(raw):
                    problems.append(
                        ValidationIssue(
                            file=self._relative_file(qmd),
                            line=line_no,
                            code=code,
                            message=message,
                            severity="error",
                            context=raw.strip()[:120],
                        )
                    )
                    break
            # Only check first paragraph per file
            break

    return ValidationRunResult(
        name="dropcaps",
        description="Validate drop cap compatibility",
        files_checked=len(qmd_files),
        issues=problems,
        elapsed_ms=int((time.time() - began) * 1000),
    )
# ------------------------------------------------------------------
# Parts (ported from validate_part_keys.py)
# ------------------------------------------------------------------
def _run_parts(self, root: Path) -> ValidationRunResult:
    r"""Check every ``\part{key:...}`` key against the summaries files.

    Known keys are read from the ``parts`` list of up to three
    ``summaries.yml`` files under ``self.config_manager.book_dir``.  Keys
    are compared after lower-casing and removing ``_`` and ``-``.  The check
    is skipped (zero files, no issues) when PyYAML is not importable or when
    no key could be loaded.

    Args:
        root: Location passed to ``self._qmd_files`` to select files.

    Returns:
        A ``ValidationRunResult`` named ``parts`` with one
        ``invalid_part_key`` error per unknown key.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []
    part_key_pat = re.compile(r"\\part\{key:([^}]+)\}")

    def normalize(key: str) -> str:
        """Return the comparison form of a part key."""
        return key.lower().replace("_", "").replace("-", "")

    # Load summaries
    summaries_keys: Set[str] = set()
    book_dir = self.config_manager.book_dir
    possible_paths = [
        book_dir / "contents" / "parts" / "summaries.yml",
        book_dir / "contents" / "vol1" / "parts" / "summaries.yml",
        book_dir / "contents" / "vol2" / "parts" / "summaries.yml",
    ]
    try:
        import yaml
    except ImportError:
        return ValidationRunResult(
            name="parts",
            description="Validate part keys (skipped — pyyaml not installed)",
            files_checked=0,
            issues=[],
            elapsed_ms=int((time.time() - start) * 1000),
        )

    for yml_path in possible_paths:
        if not yml_path.exists():
            continue
        # Loading stays best-effort: an unreadable or malformed summaries
        # file contributes no keys.  ValueError covers decoding failures.
        try:
            data = yaml.safe_load(yml_path.read_text(encoding="utf-8"))
        except (OSError, ValueError, yaml.YAMLError):
            continue
        # Unexpected shapes (empty file, "parts" not a list, non-mapping
        # entries, non-string keys) are skipped entry by entry, so one bad
        # entry no longer discards the valid entries that follow it.
        parts = data.get("parts") if isinstance(data, dict) else None
        if not isinstance(parts, list):
            continue
        for part in parts:
            if not isinstance(part, dict):
                continue
            key = part.get("key")
            if isinstance(key, str):
                summaries_keys.add(normalize(key))

    if not summaries_keys:
        # No summaries found — skip gracefully
        return ValidationRunResult(
            name="parts",
            description="Validate part keys (skipped — no summaries.yml found)",
            files_checked=0,
            issues=[],
            elapsed_ms=int((time.time() - start) * 1000),
        )

    for file in files:
        content = self._read_text(file)
        for m in part_key_pat.finditer(content):
            key = m.group(1)
            if normalize(key) in summaries_keys:
                continue
            issues.append(
                ValidationIssue(
                    file=self._relative_file(file),
                    line=content[: m.start()].count("\n") + 1,
                    code="invalid_part_key",
                    message=f"Part key '{key}' not found in summaries.yml",
                    severity="error",
                    context=m.group(0),
                )
            )

    return ValidationRunResult(
        name="parts",
        # Plain string, not an f-string: braces must not be doubled.
        description="Validate \\part{key:...} against summaries.yml",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
# ------------------------------------------------------------------
# Heading levels (detect skipped heading levels)
# ------------------------------------------------------------------
def _run_heading_levels(self, root: Path) -> ValidationRunResult:
    """Warn when a top-level heading skips one or more levels going deeper.

    Headings are compared only with the previous heading at div depth 0;
    anything inside a ``::: {...}`` div, a fenced code block or the YAML
    front matter is ignored.  Moving to a shallower level, or exactly one
    level deeper, is never reported.
    """
    began = time.time()
    qmd_files = self._qmd_files(root)
    warnings: List[ValidationIssue] = []

    heading_pat = re.compile(r"^(#{1,6})\s+")
    code_fence = re.compile(r"^```")
    yaml_fence = re.compile(r"^---\s*$")
    # Div open: ::: or :::: (with optional class/id)
    div_open_pat = re.compile(r"^(:{3,})\s*\{")
    # Div close: bare ::: or :::: on its own line
    div_close_pat = re.compile(r"^(:{3,})\s*$")

    for qmd in qmd_files:
        fenced = False
        front_matter = False
        depth = 0
        last_level = 0

        for line_no, raw in enumerate(self._read_text(qmd).splitlines(), 1):
            text = raw.strip()

            # YAML front matter only starts on the very first line.
            if line_no == 1 and yaml_fence.match(raw):
                front_matter = True
                continue
            if front_matter:
                front_matter = yaml_fence.match(raw) is None
                continue

            if code_fence.match(text):
                fenced = not fenced
                continue
            if fenced:
                continue

            if div_open_pat.match(text):
                depth += 1
                continue
            if depth > 0:
                # Inside a div: only a closing fence changes state.
                if div_close_pat.match(text):
                    depth -= 1
                continue

            hit = heading_pat.match(raw)
            if hit is None:
                continue

            level = len(hit.group(1))
            # e.g. ## -> #### skips ###
            if last_level > 0 and level > last_level + 1:
                skipped = ", ".join(f"H{n}" for n in range(last_level + 1, level))
                # Heading text without the leading hashes or trailing attributes.
                title = raw.lstrip("#").strip().partition("{")[0].strip()
                warnings.append(
                    ValidationIssue(
                        file=self._relative_file(qmd),
                        line=line_no,
                        code="heading_level_skip",
                        message=f"Heading jumps from H{last_level} to H{level} (skips {skipped})",
                        severity="warning",
                        context=title[:80],
                    )
                )
            last_level = level

    return ValidationRunResult(
        name="heading-levels",
        description="Detect skipped heading levels (e.g., ## to ####)",
        files_checked=len(qmd_files),
        issues=warnings,
        elapsed_ms=int((time.time() - began) * 1000),
    )
# ------------------------------------------------------------------
# Duplicate consecutive words (detect "the the", "is is", etc.)
# ------------------------------------------------------------------
_DUPE_WORD_PAT = re.compile(
r"\b(\w{2,})\s+\1\b",
re.IGNORECASE,
)
# Known false positives: intentional repetitions
_DUPE_WORD_ALLOW = frozenset({
"had", "that", "do", "bye", "bla", "cha", "go",
"log", # "log log n" is valid math
})
def _run_duplicate_words(self, root: Path) -> ValidationRunResult:
    """Warn about immediately repeated words such as "the the".

    YAML front matter, fenced code, HTML comments/tags, raw LaTeX lines,
    div fences and cell-option lines are skipped.  A repetition is also
    ignored when the word is in ``_DUPE_WORD_ALLOW``, when the text before
    it ends with a backslash, or when a ``{`` occurs within the match or
    the five characters after it.
    """
    began = time.time()
    qmd_files = self._qmd_files(root)
    found: List[ValidationIssue] = []

    fence_re = re.compile(r"^```")
    front_matter_re = re.compile(r"^---\s*$")
    markup_prefixes = ("<!--", "\\", ":::")
    option_prefixes = ("#|", "%%|")

    for qmd in qmd_files:
        in_fence = False
        in_front_matter = False

        for line_no, raw in enumerate(self._read_text(qmd).splitlines(), 1):
            # Front matter can only begin on the first line of the file.
            if line_no == 1 and front_matter_re.match(raw):
                in_front_matter = True
                continue
            if in_front_matter:
                in_front_matter = front_matter_re.match(raw) is None
                continue

            text = raw.strip()
            if fence_re.match(text):
                in_fence = not in_fence
                continue
            if in_fence:
                continue

            if text.startswith(markup_prefixes):
                continue
            # HTML tags are skipped, but "<http..." autolinks are prose.
            if text.startswith("<") and not text.startswith("<http"):
                continue
            if text.startswith(option_prefixes):
                continue

            for hit in self._DUPE_WORD_PAT.finditer(raw):
                word = hit.group(1)
                if word.lower() in self._DUPE_WORD_ALLOW:
                    continue
                after_backslash = raw[: hit.start()].rstrip().endswith("\\")
                brace_nearby = "{" in raw[hit.start() : hit.end() + 5]
                if after_backslash or brace_nearby:
                    continue
                found.append(
                    ValidationIssue(
                        file=self._relative_file(qmd),
                        line=line_no,
                        code="duplicate_word",
                        message=f'Duplicate word: "{word} {word}"',
                        severity="warning",
                        context=text[:120],
                    )
                )

    return ValidationRunResult(
        name="duplicate-words",
        description="Detect duplicate consecutive words (typos)",
        files_checked=len(qmd_files),
        issues=found,
        elapsed_ms=int((time.time() - began) * 1000),
    )
# ------------------------------------------------------------------
# Images (ported from validate_image_references.py)
# ------------------------------------------------------------------
def _run_images(self, root: Path) -> ValidationRunResult:
    """Check that local image references point at files that exist.

    Markdown image targets (``![...](target)``) are resolved relative to
    the referencing file.  ``http://``/``https://`` targets and targets
    whose suffix is not a known image extension are ignored.  A missing
    file yields ``missing_image``; an existing file whose on-disk spelling
    differs from the reference (per ``self._realcase``) yields
    ``image_case_mismatch``.  Both are errors.

    Args:
        root: Location passed to ``self._qmd_files`` to select files.

    Returns:
        A ``ValidationRunResult`` named ``images``.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []
    img_pat = re.compile(r"!\[(?:[^\]]|\[[^\]]*\])*\]\(([^)]+)\)(?:\{[^}]*\})?")
    valid_exts = {".png", ".jpg", ".jpeg", ".gif", ".svg"}

    for file in files:
        content = self._read_text(file)
        for m in img_pat.finditer(content):
            img_path = m.group(1).strip()
            if img_path.startswith(("http://", "https://")):
                continue
            if Path(img_path).suffix.lower() not in valid_exts:
                continue

            resolved = (file.parent / img_path).resolve()
            line_no = content[: m.start()].count("\n") + 1

            if not resolved.exists():
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=line_no,
                        code="missing_image",
                        message=f"Image not found: {img_path}",
                        severity="error",
                        context=img_path,
                    )
                )
                continue

            # Case check.  OSError already includes FileNotFoundError; a
            # failure to inspect the directory is not a validation error.
            try:
                actual = self._realcase(str(resolved))
            except OSError:
                continue
            if str(resolved) != actual:
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=line_no,
                        code="image_case_mismatch",
                        message=f"Image case mismatch: ref='{resolved.name}' disk='{Path(actual).name}'",
                        severity="error",
                        context=img_path,
                    )
                )

    return ValidationRunResult(
        name="images",
        description="Validate image references exist on disk",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
@staticmethod
def _realcase(path: str) -> str:
"""Resolve actual case of a path on disk."""
dirname, basename = os.path.split(path)
if dirname == path:
return dirname
dirname = ValidateCommand._realcase(dirname)
norm_base = os.path.normcase(basename)
try:
for child in os.listdir(dirname):
if os.path.normcase(child) == norm_base:
return os.path.join(dirname, child)
except OSError:
pass
return path
# ------------------------------------------------------------------
# Self-referential sections (ported from check_self_referential_sections.py)
# ------------------------------------------------------------------
def _run_self_referential(self, root: Path) -> ValidationRunResult:
    """Detect sections that reference themselves, their parent, or a child.

    For each QMD file the heading hierarchy is built from Markdown headings
    (``#`` .. ``######``, optionally carrying a ``{#id}``). Every ``@sec-...``
    reference is then attributed to the nearest preceding heading and a
    warning is emitted when it points at that heading's own id
    (``self_reference``), its parent's id (``parent_reference``) or one of
    its direct children's ids (``child_reference``). References under a
    heading without an id are ignored.

    Lines inside fenced code blocks are not considered headings, so a
    ``# comment`` in a code cell cannot be mistaken for a level-1 heading
    and break the hierarchy.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []
    heading_pat = re.compile(r"^(#{1,6})\s+(.+?)(?:\s+\{#([^}]+)\})?$")
    ref_pat = re.compile(r"@(sec-[a-zA-Z0-9-]+)")
    for file in files:
        lines = self._read_text(file).splitlines()

        # Pass 1: collect headings (outside code fences) with their parent ids.
        headings: List[Dict[str, Any]] = []
        parent_stack: Dict[int, Dict[str, Any]] = {}
        in_code = False
        for idx, line in enumerate(lines, 1):
            if line.strip().startswith("```"):
                in_code = not in_code
                continue
            if in_code:
                continue
            m = heading_pat.match(line)
            if not m:
                continue
            level = len(m.group(1))
            # The parent is the nearest shallower level still on the stack.
            parent_id = None
            for plevel in range(level - 1, 0, -1):
                if plevel in parent_stack:
                    parent_id = parent_stack[plevel].get("id")
                    break
            hd = {"level": level, "title": m.group(2).strip(), "id": m.group(3),
                  "line": idx, "parent_id": parent_id}
            headings.append(hd)
            # A new heading closes every section at the same or a deeper level.
            parent_stack = {k: v for k, v in parent_stack.items() if k < level}
            parent_stack[level] = hd

        # Index headings by id and record direct children per parent id.
        section_map: Dict[str, Dict[str, Any]] = {}
        children_map: Dict[str, List[str]] = defaultdict(list)
        for hd in headings:
            if hd["id"]:
                section_map[hd["id"]] = hd
                if hd["parent_id"]:
                    children_map[hd["parent_id"]].append(hd["id"])

        # Pass 2: walk the lines once, advancing a pointer to the heading
        # that owns the current line (headings are already in line order).
        seen = 0
        for idx, line in enumerate(lines, 1):
            while seen < len(headings) and headings[seen]["line"] <= idx:
                seen += 1
            current = headings[seen - 1] if seen else None
            if not current or not current["id"]:
                continue
            cur_id = current["id"]
            for m in ref_pat.finditer(line):
                ref_id = m.group(1)
                if ref_id == cur_id:
                    issues.append(ValidationIssue(
                        file=self._relative_file(file), line=idx,
                        code="self_reference",
                        message=f"Section '{current['title']}' references itself (@{ref_id})",
                        severity="warning",
                        context=line.strip()[:120],
                    ))
                elif current["parent_id"] == ref_id:
                    parent = section_map.get(ref_id)
                    ptitle = parent["title"] if parent else ref_id
                    issues.append(ValidationIssue(
                        file=self._relative_file(file), line=idx,
                        code="parent_reference",
                        message=f"Section '{current['title']}' references its parent '{ptitle}' (@{ref_id})",
                        severity="warning",
                        context=line.strip()[:120],
                    ))
                elif ref_id in children_map.get(cur_id, []):
                    child = section_map.get(ref_id)
                    ctitle = child["title"] if child else ref_id
                    issues.append(ValidationIssue(
                        file=self._relative_file(file), line=idx,
                        code="child_reference",
                        message=f"Section '{current['title']}' references its child '{ctitle}' (@{ref_id})",
                        severity="warning",
                        context=line.strip()[:120],
                    ))
    return ValidationRunResult(
        name="self-referential",
        description="Detect self-referential section references",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
# ------------------------------------------------------------------
# Figure label underscores (ported from check_fig_references.py)
# ------------------------------------------------------------------
def _run_fig_label_underscores(self, root: Path) -> ValidationRunResult:
    """Report figure labels/references whose suffix contains an underscore.

    Scans QMD prose (fenced code blocks are skipped) for ``{#fig-...`` and
    ``@fig-...`` occurrences and emits a ``fig_label_underscore`` error for
    each one whose text after ``fig-`` contains ``_``.
    """
    began = time.time()
    qmd_files = self._qmd_files(root)
    label_pat = re.compile(r"(?:\{#|@)fig-([^}\s]+)")
    problems: List[ValidationIssue] = []
    for qmd in qmd_files:
        fenced = False
        for line_no, text in enumerate(self._read_text(qmd).splitlines(), 1):
            if text.strip().startswith("```"):
                # A fence line only flips the state; it is never scanned itself.
                fenced = not fenced
            elif not fenced:
                problems.extend(
                    ValidationIssue(
                        file=self._relative_file(qmd), line=line_no,
                        code="fig_label_underscore",
                        message=f"Figure label contains underscore: fig-{suffix} (use hyphens)",
                        severity="error",
                        context=text.strip()[:120],
                    )
                    for suffix in (hit.group(1) for hit in label_pat.finditer(text))
                    if "_" in suffix
                )
    return ValidationRunResult(
        name="fig-labels",
        description="Detect underscores in figure labels",
        files_checked=len(qmd_files),
        issues=problems,
        elapsed_ms=int((time.time() - began) * 1000),
    )
# ------------------------------------------------------------------
# ASCII check (ported from check_ascii.py)
# ------------------------------------------------------------------
def _run_ascii(self, root: Path) -> ValidationRunResult:
    """Warn about every non-ASCII character found in QMD prose.

    Fenced code blocks are skipped, as are lines whose stripped text starts
    with a backslash or with ``<!--``. Each remaining character outside the
    ``\\x00``-``\\x7F`` range produces one ``non_ascii`` warning, with up to
    ten characters of context on either side.
    """
    t_start = time.time()
    targets = self._qmd_files(root)
    out: List[ValidationIssue] = []
    beyond_ascii = re.compile(r"[^\x00-\x7F]")
    for path in targets:
        inside_fence = False
        for lineno, raw in enumerate(self._read_text(path).splitlines(), 1):
            trimmed = raw.strip()
            if trimmed.startswith("```"):
                inside_fence = not inside_fence
                continue
            # Skip code, LaTeX raw lines and HTML comment openers.
            if inside_fence or trimmed.startswith(("\\", "<!--")):
                continue
            for hit in beyond_ascii.finditer(raw):
                ch = hit.group(0)
                at = hit.start()
                # Slicing clamps the upper bound to the line length.
                snippet = raw[max(0, at - 10):at + 10]
                out.append(ValidationIssue(
                    file=self._relative_file(path), line=lineno,
                    code="non_ascii",
                    message=f"Non-ASCII character '{ch}' (U+{ord(ch):04X})",
                    severity="warning",
                    context=snippet.strip(),
                ))
    return ValidationRunResult(
        name="ascii",
        description="Detect non-ASCII characters in QMD files",
        files_checked=len(targets),
        issues=out,
        elapsed_ms=int((time.time() - t_start) * 1000),
    )
# ------------------------------------------------------------------
# Percent spacing (no space between number/str and %)
# ------------------------------------------------------------------
# Matches a backtick-delimited inline span followed by one or more whitespace
# characters and then a percent sign.
PERCENT_SPACING_PATTERN = re.compile(r"`[^`]*`\s+%")
def _run_percent_spacing(self, root: Path) -> ValidationRunResult:
    """Report whitespace between an inline code span and a following ``%``.

    Every match of ``PERCENT_SPACING_PATTERN`` on a QMD line outside fenced
    code blocks yields one ``percent_spacing`` error.
    """
    clock = time.time()
    sources = self._qmd_files(root)
    flagged: List[ValidationIssue] = []
    for src in sources:
        code_open = False
        for n, row in enumerate(self._read_text(src).splitlines(), 1):
            if row.strip().startswith("```"):
                code_open = not code_open
                continue
            if code_open:
                continue
            for hit in self.PERCENT_SPACING_PATTERN.finditer(row):
                # Context window: 5 characters before the match, 10 after.
                window_start = max(0, hit.start() - 5)
                snippet = row[window_start : hit.end() + 10].strip()
                flagged.append(
                    ValidationIssue(
                        file=self._relative_file(src),
                        line=n,
                        code="percent_spacing",
                        message="Remove space between value and % (use e.g. `{python} x`% not `{python} x` %)",
                        severity="error",
                        context=snippet,
                    )
                )
    return ValidationRunResult(
        name="percent-spacing",
        description="No space between inline value and % in QMD prose",
        files_checked=len(sources),
        issues=flagged,
        elapsed_ms=int((time.time() - clock) * 1000),
    )
# ------------------------------------------------------------------
# Unit spacing (style: "100 ms", "4 GB" — never "100ms" or "4GB")
# ------------------------------------------------------------------
# Number (optional decimal part) immediately followed, with no space, by one of
# the listed units (invalid per book-prose.md). The trailing \b requires the
# unit to end at a word boundary.
UNIT_SPACING_PATTERN = re.compile(
r"\d+(?:\.\d+)?"
r"(?:ms|GB|TB|MB|KB|Gbps|Mbps|Tbps|TFLOPS|GFLOPS|W)\b"
)
def _run_unit_spacing(self, root: Path) -> ValidationRunResult:
    """Warn when a number is glued to its unit (``100ms`` rather than ``100 ms``).

    Every match of ``UNIT_SPACING_PATTERN`` on a QMD line outside fenced code
    blocks yields one ``unit_spacing`` warning.
    """

    def prose_rows(text):
        """Yield (1-based line number, line) for lines outside code fences."""
        fenced = False
        for number, row in enumerate(text.splitlines(), 1):
            if row.strip().startswith("```"):
                fenced = not fenced
            elif not fenced:
                yield number, row

    began = time.time()
    documents = self._qmd_files(root)
    warnings_found: List[ValidationIssue] = []
    for doc in documents:
        for number, row in prose_rows(self._read_text(doc)):
            for hit in self.UNIT_SPACING_PATTERN.finditer(row):
                # Context window: 2 characters before the match, 5 after.
                excerpt = row[max(0, hit.start() - 2) : hit.end() + 5].strip()
                warnings_found.append(
                    ValidationIssue(
                        file=self._relative_file(doc),
                        line=number,
                        code="unit_spacing",
                        message="Insert space between number and unit (e.g. 100 ms not 100ms, 4 GB not 4GB)",
                        severity="warning",
                        context=excerpt,
                    )
                )
    return ValidationRunResult(
        name="unit-spacing",
        description="Require space between number and unit (book-prose.md)",
        files_checked=len(documents),
        issues=warnings_found,
        elapsed_ms=int((time.time() - began) * 1000),
    )
# ------------------------------------------------------------------
# Binary units (style: "GB" and "TB", not "GiB" or "TiB" in prose)
# ------------------------------------------------------------------
# Matches the binary unit names GiB, TiB, MiB and KiB as whole words.
BINARY_UNITS_PATTERN = re.compile(r"\b(GiB|TiB|MiB|KiB)\b")
def _run_binary_units(self, root: Path) -> ValidationRunResult:
    """Warn about binary unit names (GiB, TiB, ...) appearing in QMD prose.

    Every match of ``BINARY_UNITS_PATTERN`` on a line outside fenced code
    blocks yields one ``binary_units`` warning.
    """
    t_begin = time.time()
    qmds = self._qmd_files(root)
    collected: List[ValidationIssue] = []
    for qmd in qmds:
        within_code = False
        for lineno, text in enumerate(self._read_text(qmd).splitlines(), 1):
            if text.strip().startswith("```"):
                within_code = not within_code
                continue
            if within_code:
                continue
            collected.extend(
                ValidationIssue(
                    file=self._relative_file(qmd),
                    line=lineno,
                    code="binary_units",
                    message="Use GB/TB not GiB/TiB in prose (book-prose.md)",
                    severity="warning",
                    # Context window: 3 characters on either side of the match.
                    context=text[max(0, hit.start() - 3) : hit.end() + 3].strip(),
                )
                for hit in self.BINARY_UNITS_PATTERN.finditer(text)
            )
    return ValidationRunResult(
        name="binary-units",
        description="No GiB/TiB in prose — use GB/TB",
        files_checked=len(qmds),
        issues=collected,
        elapsed_ms=int((time.time() - t_begin) * 1000),
    )
# ------------------------------------------------------------------
# Contractions (forbidden in body prose per book-prose.md)
# ------------------------------------------------------------------
# Case-insensitive, whole-word match for a fixed list of contractions written
# with a straight apostrophe (').
CONTRACTIONS_PATTERN = re.compile(
r"\b(can't|don't|it's|we'll|won't|hasn't|haven't|isn't|aren't|wasn't|weren't|"
r"doesn't|didn't|wouldn't|couldn't|shouldn't|that's|there's|here's|what's)\b",
re.IGNORECASE,
)
def _run_contractions(self, root: Path) -> ValidationRunResult:
    """Warn about contractions in QMD body prose.

    Fenced code blocks are skipped, as are lines whose stripped text starts
    with ``|`` or ``<!--``. Every remaining match of ``CONTRACTIONS_PATTERN``
    yields one ``contractions`` warning.
    """
    t_zero = time.time()
    chapters = self._qmd_files(root)
    results: List[ValidationIssue] = []
    for chapter in chapters:
        fence_open = False
        for line_no, body in enumerate(self._read_text(chapter).splitlines(), 1):
            lead = body.strip()
            if lead.startswith("```"):
                fence_open = not fence_open
                continue
            # Outside prose: code, table rows and HTML comment openers.
            if fence_open or lead.startswith(("|", "<!--")):
                continue
            for hit in self.CONTRACTIONS_PATTERN.finditer(body):
                first = max(0, hit.start() - 2)
                around = body[first : hit.end() + 2].strip()
                results.append(
                    ValidationIssue(
                        file=self._relative_file(chapter),
                        line=line_no,
                        code="contractions",
                        message="Contractions forbidden in body prose — use full form (e.g. cannot, do not)",
                        severity="warning",
                        context=around,
                    )
                )
    return ValidationRunResult(
        name="contractions",
        description="No contractions in body prose (book-prose.md)",
        files_checked=len(chapters),
        issues=results,
        elapsed_ms=int((time.time() - t_zero) * 1000),
    )
# ------------------------------------------------------------------
# Unblended prose (paragraph split with leading space after period)
# ------------------------------------------------------------------
def _run_unblended_prose(self, root: Path) -> ValidationRunResult:
    """Warn about lines that look like a wrongly split paragraph.

    A line is reported as ``unblended_prose`` when the previous line
    (stripped) ends with a period and the line itself starts with a single
    space followed by an uppercase character. Pairs are skipped while the
    fence state, toggled by the *previous* line, is "inside code".
    """
    began = time.time()
    docs = self._qmd_files(root)
    hits: List[ValidationIssue] = []
    for doc in docs:
        rows = self._read_text(doc).splitlines()
        fenced = False
        # Pair every line with its predecessor; `pos` is the 1-based number
        # of the second line of the pair.
        for pos, (above, here) in enumerate(zip(rows, rows[1:]), start=2):
            if above.strip().startswith("```"):
                fenced = not fenced
            if fenced:
                continue
            if not above.strip().endswith("."):
                continue
            looks_split = len(here) > 1 and here[0] == " " and here[1].isupper()
            if not looks_split:
                continue
            hits.append(
                ValidationIssue(
                    file=self._relative_file(doc),
                    line=pos,
                    code="unblended_prose",
                    message="Paragraph likely split: line starts with space after period — merge into one paragraph",
                    severity="warning",
                    # At most the first 60 characters of the line.
                    context=here[:60].strip(),
                )
            )
    return ValidationRunResult(
        name="unblended-prose",
        description="Detect wrongly split paragraphs (leading space after period)",
        files_checked=len(docs),
        issues=hits,
        elapsed_ms=int((time.time() - began) * 1000),
    )
# ------------------------------------------------------------------
# Times spacing (space after $\\times$ before word/unit per book-prose.md)
# ------------------------------------------------------------------
# Matches "$\times$" (whitespace is tolerated before the closing "$") directly
# followed by a letter or "(". No whitespace is allowed after the closing "$",
# so the correctly spaced form ("$\times$ speedup") is not flagged.
TIMES_SPACING_PATTERN = re.compile(r"\$\\times\s*\$[a-zA-Z\(]")
def _run_times_spacing(self, root: Path) -> ValidationRunResult:
    """Warn about ``$\\times$`` occurrences matched by ``TIMES_SPACING_PATTERN``.

    Lines inside fenced code blocks are skipped; every match on a remaining
    QMD line yields one ``times_spacing`` warning.
    """
    timer = time.time()
    pages = self._qmd_files(root)
    notes: List[ValidationIssue] = []
    for page in pages:
        in_fence = False
        numbered = enumerate(self._read_text(page).splitlines(), 1)
        for row_no, row in numbered:
            is_fence = row.strip().startswith("```")
            if is_fence:
                in_fence = not in_fence
            if is_fence or in_fence:
                continue
            for hit in self.TIMES_SPACING_PATTERN.finditer(row):
                # Context window: 2 characters before the match, 10 after.
                left = max(0, hit.start() - 2)
                excerpt = row[left : hit.end() + 10].strip()
                notes.append(
                    ValidationIssue(
                        file=self._relative_file(page),
                        line=row_no,
                        code="times_spacing",
                        message="Add space after $\\times$ before word or unit (e.g. $\\times$ speedup)",
                        severity="warning",
                        context=excerpt,
                    )
                )
    return ValidationRunResult(
        name="times-spacing",
        description="Space after $\\times$ before word/unit (book-prose.md)",
        files_checked=len(pages),
        issues=notes,
        elapsed_ms=int((time.time() - timer) * 1000),
    )
# ------------------------------------------------------------------
# Cross-chapter footnote duplicates (ported from audit_footnotes_cross_chapter.py)
# ------------------------------------------------------------------
def _run_footnote_cross_chapter(self, root: Path) -> ValidationRunResult:
    """Find footnote IDs (``[^fn-...]:``) that are defined in more than one file.

    All footnote definitions are collected per ID together with the file and
    line where they occur. An ID is reported only when its definitions span
    at least two distinct files; each definition then gets one
    ``cross_chapter_footnote`` warning whose message states in how many
    *other files* the ID is also defined. Repeated definitions within a
    single file are therefore not misreported as cross-chapter duplicates.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []
    fn_def_pat = re.compile(r"\[\^(fn-[^\]]+)\]:\s*(.+?)(?=\n\n|\n\[\^|\Z)", re.DOTALL)

    # fn_id -> [(file, 1-based line of the definition, whitespace-collapsed text)]
    definitions: Dict[str, List[Tuple[Path, int, str]]] = defaultdict(list)
    for file in files:
        content = self._read_text(file)
        for m in fn_def_pat.finditer(content):
            # Record the line now so the file need not be read again later
            # and every definition reports its own line.
            line_no = content[: m.start()].count("\n") + 1
            summary = " ".join(m.group(2).split())[:200]
            definitions[m.group(1)].append((file, line_no, summary))

    for fn_id, occurrences in definitions.items():
        defining_files = {file for file, _, _ in occurrences}
        if len(defining_files) <= 1:
            continue
        other_files = len(defining_files) - 1
        for file, line_no, summary in occurrences:
            issues.append(ValidationIssue(
                file=self._relative_file(file), line=line_no,
                code="cross_chapter_footnote",
                message=f"Footnote [^{fn_id}] also defined in {other_files} other file(s)",
                severity="warning",
                context=summary[:80],
            ))
    return ValidationRunResult(
        name="cross-chapter-footnotes",
        description="Detect duplicate footnote IDs across chapters",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
# ------------------------------------------------------------------
# Table content validation (delegated to validate_tables.py)
# ------------------------------------------------------------------
def _run_table_content(self, root: Path) -> ValidationRunResult:
    """Run validate_tables.py on *root* (``-d <root>``) as the "table-content" check."""
    # parents[2] is three directory levels above this file.
    book_dir = Path(__file__).resolve().parents[2]
    validator = book_dir / "tools" / "scripts" / "content" / "validate_tables.py"
    return self._delegate_script(validator, ["-d", str(root)], "table-content")
# ------------------------------------------------------------------
# Spelling checks (delegated to check_prose_spelling.py / check_tikz_spelling.py)
# ------------------------------------------------------------------
def _run_spelling_prose(self, root: Path) -> ValidationRunResult:
    """Run check_prose_spelling.py on *root* as the "spelling-prose" check."""
    # parents[2] is three directory levels above this file.
    book_dir = Path(__file__).resolve().parents[2]
    checker = book_dir / "tools" / "scripts" / "content" / "check_prose_spelling.py"
    return self._delegate_script(checker, [str(root)], "spelling-prose")
def _run_spelling_tikz(self, root: Path) -> ValidationRunResult:
    """Run check_tikz_spelling.py as the "spelling-tikz" check.

    The script is invoked without arguments; *root* is not passed to it.
    """
    # parents[2] is three directory levels above this file.
    book_dir = Path(__file__).resolve().parents[2]
    checker = book_dir / "tools" / "scripts" / "content" / "check_tikz_spelling.py"
    no_arguments: List[str] = []
    return self._delegate_script(checker, no_arguments, "spelling-tikz")
# ------------------------------------------------------------------
# EPUB validation (delegated to validate_epub.py)
# ------------------------------------------------------------------
def _run_epub(self, root: Path) -> ValidationRunResult:
    """Run validate_epub.py (``--quick``) on the most recently modified EPUB.

    EPUB files are searched recursively under the directory three levels
    above this file. When none exist, an empty result is returned without
    invoking the script. *root* is not used.
    """
    book_dir = Path(__file__).resolve().parents[2]
    validator = book_dir / "tools" / "scripts" / "utilities" / "validate_epub.py"
    candidates = list(book_dir.rglob("*.epub"))
    if not candidates:
        return ValidationRunResult(
            name="epub", description="EPUB validation (no .epub files found)",
            files_checked=0, issues=[], elapsed_ms=0,
        )
    # Pick the newest file by modification time (first one wins on ties).
    newest = max(candidates, key=lambda p: p.stat().st_mtime)
    return self._delegate_script(validator, ["--quick", str(newest)], "epub")
# ------------------------------------------------------------------
# Content tree: require shared/ and frontmatter/ (not only vol1/vol2)
# ------------------------------------------------------------------
# Required paths under contents/ so that scripts don't assume only vol1/vol2 exist.
# Each entry is (path relative to contents/, True if it must be a directory or
# False if it must be a regular file).
CONTENT_TREE_REQUIRED: List[Tuple[str, bool]] = [
("shared", True), # (path relative to contents, is_dir)
("shared/notation.qmd", False),
("frontmatter", True),
]
def _run_content_tree(self, root: Path) -> ValidationRunResult:
    """Verify that every path in ``CONTENT_TREE_REQUIRED`` exists under contents/.

    When *root* is ``contents/vol1`` or ``contents/vol2`` its parent is used
    as the contents directory; otherwise *root* itself is. If that directory
    does not contain both ``vol1`` and ``vol2`` sub-directories, the check is
    skipped and an empty result is returned. Otherwise one ``content-tree``
    error is emitted per missing required directory or file.
    """
    began = time.time()
    inside_volume = root.name in ("vol1", "vol2") and root.parent.name == "contents"
    contents_dir = root.parent if inside_volume else root

    is_contents_root = (contents_dir / "vol1").is_dir() and (contents_dir / "vol2").is_dir()
    if not is_contents_root:
        # Not the book contents root (e.g. a chapter path was passed): skip.
        return ValidationRunResult(
            name="content-tree",
            description="Content tree (shared/frontmatter required)",
            files_checked=0,
            issues=[],
            elapsed_ms=int((time.time() - began) * 1000),
        )

    missing: List[ValidationIssue] = []
    for rel, is_dir in self.CONTENT_TREE_REQUIRED:
        required = contents_dir / rel
        if is_dir:
            present = required.is_dir()
            problem = f"Required directory missing: contents/{rel} (shared content used by both volumes)"
        else:
            present = required.is_file()
            problem = f"Required file missing: contents/{rel}"
        if present:
            continue
        missing.append(
            ValidationIssue(
                file=str(required),
                line=0,
                code="content-tree",
                message=problem,
                severity="error",
            )
        )
    return ValidationRunResult(
        name="content-tree",
        description="Content tree (shared/frontmatter required)",
        files_checked=len(self.CONTENT_TREE_REQUIRED),
        issues=missing,
        elapsed_ms=int((time.time() - began) * 1000),
    )
# ------------------------------------------------------------------
# Source citation validation (delegated to manage_sources.py)
# ------------------------------------------------------------------
def _run_sources(self, root: Path) -> ValidationRunResult:
    """Run ``manage_sources.py --problems`` and report its outcome.

    The script is executed with ``python3`` from the ``quarto`` directory
    three levels above this file, with a 120 second timeout. Exit code 0
    yields an empty result; any other exit code yields a single ``sources``
    error carrying up to 500 characters of the combined stdout/stderr.
    A launch failure (``FileNotFoundError``) or a timeout
    (``subprocess.TimeoutExpired``) is reported as a single error rather
    than propagating to the caller. *root* is not used.
    """
    import subprocess as _sp

    book_dir = Path(__file__).resolve().parent.parent.parent
    script = book_dir / "tools" / "scripts" / "utilities" / "manage_sources.py"
    # manage_sources.py expects to be run from the quarto root (where contents/ lives)
    quarto_dir = book_dir / "quarto"
    t0 = time.time()

    def _result(issues: List[ValidationIssue]) -> ValidationRunResult:
        # Elapsed time is measured when the result is built.
        return ValidationRunResult(
            name="sources", description="Source citation validation",
            files_checked=0, issues=issues,
            elapsed_ms=int((time.time() - t0) * 1000),
        )

    def _error(file: str, message: str) -> ValidationRunResult:
        return _result([ValidationIssue(
            file=file, line=0, code="sources", message=message, severity="error",
        )])

    cmd = ["python3", str(script), "--problems"]
    try:
        result = _sp.run(cmd, capture_output=True, text=True, timeout=120, cwd=str(quarto_dir))
    except FileNotFoundError:
        return _error(str(script), f"Script not found: {script}")
    except _sp.TimeoutExpired:
        # Without this handler a hung script would crash the whole validation run.
        return _error(str(script), "Script timed out after 120s")

    if result.returncode == 0:
        return _result([])
    output = (result.stdout + result.stderr).strip()
    return _error(
        "(script output)",
        output[:500] if output else f"Script exited with code {result.returncode}",
    )
def _run_check_references(self, root: Path, ns: Optional[argparse.Namespace] = None) -> ValidationRunResult:
    """Run ``reference_check.run`` over the selected .bib files.

    Options are read from *ns* with ``getattr`` defaults, so *ns* may be
    ``None``. Relative paths are taken relative to the directory two levels
    above ``self.config_manager.book_dir``. When ``refs_only_from_report``
    or ``refs_only_keys_file`` names a file that does not exist, a message
    is printed and a single-error result is returned without running the
    check. *root* is not used.
    """
    repo_root = self.config_manager.book_dir.parent.parent
    run_description = "Bibliography vs academic DBs (hallucinator)"

    def option(attr: str, fallback: Any = None) -> Any:
        return getattr(ns, attr, fallback)

    def anchored(raw: Any) -> Path:
        # Absolute paths are kept; relative ones hang off the repo root.
        candidate = Path(raw)
        return candidate if candidate.is_absolute() else repo_root / raw

    def missing_input(target: Path, text: str) -> ValidationRunResult:
        console.print(f"[red]{text}[/red]")
        return ValidationRunResult(
            name="references",
            description=run_description,
            files_checked=0,
            issues=[ValidationIssue(file=str(target), line=0, code="references", message=text, severity="error")],
            elapsed_ms=0,
        )

    requested_bibs = option("refs_file")
    if requested_bibs:
        bib_paths = [anchored(entry) for entry in requested_bibs]
    else:
        bib_paths = [repo_root / rel for rel in reference_check.DEFAULT_BIB_REL_PATHS]

    raw_output = option("refs_output")
    output_path = Path(raw_output) if raw_output else None
    limit = option("refs_limit")
    skip_verified = option("refs_skip_verified", False)
    thorough = option("refs_thorough", False)

    raw_cache = option("refs_cache")
    if raw_cache is None:
        cache_path = repo_root / ".references_verified.json"
    else:
        cache_path = anchored(raw_cache)

    # Optional restriction of the run to a set of keys.
    only_keys: Optional[List[str]] = None
    only_from_report = option("refs_only_from_report")
    only_keys_file = option("refs_only_keys_file")
    if only_from_report:
        report_path = anchored(only_from_report)
        if not report_path.exists():
            return missing_input(report_path, f"Report not found: {report_path}")
        only_keys = reference_check.parse_report_keys(report_path)
    elif only_keys_file:
        keys_path = anchored(only_keys_file)
        if not keys_path.exists():
            return missing_input(keys_path, f"Keys file not found: {keys_path}")
        # One key per non-blank line, surrounding whitespace removed.
        only_keys = [
            entry.strip()
            for entry in keys_path.read_text(encoding="utf-8").splitlines()
            if entry.strip()
        ]

    passed, elapsed_ms, issue_dicts, files_checked = reference_check.run(
        bib_paths,
        output_path=output_path,
        limit=limit,
        dedupe=True,
        resilient=True,
        console=console,
        cache_path=cache_path,
        skip_verified=skip_verified,
        thorough=thorough,
        only_keys=only_keys,
    )
    converted = []
    for entry in issue_dicts:
        converted.append(
            ValidationIssue(
                file=entry["file"],
                line=entry["line"],
                code=entry["code"],
                message=entry["message"],
                severity=entry.get("severity", "error"),
            )
        )
    return ValidationRunResult(
        name="references",
        description=run_description,
        files_checked=files_checked,
        issues=converted,
        elapsed_ms=elapsed_ms,
    )
# ------------------------------------------------------------------
# Shared helpers
# ------------------------------------------------------------------
def _line_for_token(self, content: str, token: str) -> int:
index = content.find(token)
if index < 0:
return 1
return content[:index].count("\n") + 1
def _print_human_summary(self, summary: Dict[str, Any], verbose: bool = False) -> None:
    """Print a per-check table followed by the issues of each failing check.

    Reads ``summary["runs"]``, ``summary["total_issues"]`` and
    ``summary["status"]``. When there are no issues only the table and a
    success line are printed. Otherwise at most 30 issues are listed per
    check (with their context when *verbose* is true), followed by a final
    failure or warning line.
    """
    runs = summary["runs"]
    total = summary["total_issues"]
    status = summary["status"]

    overview = Table(show_header=True, header_style="bold cyan", box=None)
    for heading, look in (
        ("Check", "cyan"),
        ("Files", "dim"),
        ("Issues", "yellow"),
        ("Elapsed", "dim"),
        ("Status", "white"),
    ):
        overview.add_column(heading, style=look)
    for run in runs:
        verdict = "PASS" if run["passed"] else "FAIL"
        overview.add_row(
            run["name"],
            str(run["files_checked"]),
            str(run["issue_count"]),
            f'{run["elapsed_ms"]}ms',
            verdict,
        )
    console.print(Panel(overview, title="Binder Check Summary", border_style="cyan"))

    if total == 0:
        console.print("[green]✅ All validation checks passed.[/green]")
        return

    # Tally errors and warnings across all runs for the closing line.
    severities = [issue["severity"] for run in runs for issue in run["issues"]]
    total_errors = sum(1 for sev in severities if sev == "error")
    total_warnings = len(severities) - total_errors

    shown_per_run = 30
    for run in runs:
        if run["issue_count"] == 0:
            continue
        run_errors = sum(1 for i in run["issues"] if i["severity"] == "error")
        run_warnings = run["issue_count"] - run_errors
        parts = []
        if run_errors:
            parts.append(f"{run_errors} error(s)")
        if run_warnings:
            parts.append(f"{run_warnings} warning(s)")
        label = ", ".join(parts)
        color = "bold red" if run_errors else "bold yellow"
        console.print(f"[{color}]{run['name']}[/{color}] ({label})")
        for issue in run["issues"][:shown_per_run]:
            line = issue["line"]
            file = issue["file"]
            msg = issue["message"]
            sev_icon = "" if issue["severity"] == "error" else "⚠️"
            console.print(f" {sev_icon} {file}:{line} {msg}")
            if verbose and issue.get("context"):
                console.print(f" [dim]{issue['context']}[/dim]")
        if run["issue_count"] > shown_per_run:
            console.print(f" [dim]... {run['issue_count'] - 30} more[/dim]")
        console.print()

    if status == "failed":
        console.print(f"[red]❌ Validation failed with {total_errors} error(s).[/red]")
    elif total_warnings > 0:
        console.print(f"[yellow]⚠️ Passed with {total_warnings} warning(s).[/yellow]")
def _emit(self, as_json: bool, payload: Dict[str, Any], failed: bool) -> None:
    """Print *payload* as indented JSON, or its message as a colored line.

    With *as_json* the whole payload is dumped with ``indent=2``. Otherwise
    ``payload["message"]`` is printed in red when *failed* and in green when
    not, falling back to a generic text if the key is absent.
    """
    if as_json:
        print(json.dumps(payload, indent=2))
        return
    if failed:
        color, fallback = "red", "Operation failed"
    else:
        color, fallback = "green", "Operation succeeded"
    console.print(f"[{color}]{payload.get('message', fallback)}[/{color}]")
# ------------------------------------------------------------------
# Delegated checks (call existing scripts via subprocess)
# ------------------------------------------------------------------
@staticmethod
def _delegate_script(script_path: Path, args: List[str], run_name: str) -> ValidationRunResult:
    """Run ``python3 <script_path> <args...>`` and wrap the outcome in a result.

    Exit code 0 gives an empty result. Any other exit code gives a single
    error carrying up to 500 characters of the combined stdout/stderr (or the
    exit code when there is no output). ``FileNotFoundError`` and a timeout
    after 120 seconds are each reported as a single error. *run_name* is used
    as the result name, its description and the issue code.
    """
    import subprocess

    began = time.time()

    def outcome(issues: List[ValidationIssue]) -> ValidationRunResult:
        # Elapsed time is measured when the result is built.
        return ValidationRunResult(
            name=run_name, description=run_name,
            files_checked=0, issues=issues,
            elapsed_ms=int((time.time() - began) * 1000),
        )

    def failure(where: str, message: str) -> ValidationRunResult:
        return outcome([ValidationIssue(
            file=where, line=0, code=run_name,
            message=message, severity="error",
        )])

    command = ["python3", str(script_path)] + args
    try:
        proc = subprocess.run(command, capture_output=True, text=True, timeout=120)
    except FileNotFoundError:
        return failure(str(script_path), f"Script not found: {script_path}")
    except subprocess.TimeoutExpired:
        return failure(str(script_path), "Script timed out after 120s")

    if proc.returncode == 0:
        return outcome([])
    # Script failed: report its output as a single error.
    combined = (proc.stdout + proc.stderr).strip()
    return failure(
        "(script output)",
        combined[:500] if combined else f"Script exited with code {proc.returncode}",
    )
def _run_grid_tables(self, root: Path) -> ValidationRunResult:
    """Run convert_grid_to_pipe_tables.py ``--check`` on every QMD file under *root*.

    When *root* contains no ``.qmd`` files an empty, passing result is
    returned without invoking the script. That result supplies every
    required ``ValidationRunResult`` field, so the early return no longer
    raises ``TypeError``.
    """
    script = (
        Path(__file__).resolve().parent.parent.parent
        / "tools" / "scripts" / "utilities" / "convert_grid_to_pipe_tables.py"
    )
    qmd_files = [str(f) for f in sorted(root.rglob("*.qmd"))]
    if not qmd_files:
        # description mirrors what _delegate_script uses (the run name).
        return ValidationRunResult(
            name="grid-tables", description="grid-tables",
            files_checked=0, issues=[], elapsed_ms=0,
        )
    return self._delegate_script(script, ["--check"] + qmd_files, "grid-tables")
def _run_image_formats(self, root: Path) -> ValidationRunResult:
    """Run manage_images.py on every PNG/JPG/JPEG/GIF file under *root*.

    When no such files exist an empty, passing result is returned without
    invoking the script. That result supplies every required
    ``ValidationRunResult`` field, so the early return no longer raises
    ``TypeError``.
    """
    script = (
        Path(__file__).resolve().parent.parent.parent
        / "tools" / "scripts" / "images" / "manage_images.py"
    )
    image_files: List[str] = []
    for ext in ("*.png", "*.jpg", "*.jpeg", "*.gif"):
        image_files.extend(str(f) for f in sorted(root.rglob(ext)))
    if not image_files:
        # description mirrors what _delegate_script uses (the run name).
        return ValidationRunResult(
            name="image-formats", description="image-formats",
            files_checked=0, issues=[], elapsed_ms=0,
        )
    return self._delegate_script(script, image_files, "image-formats")
def _run_external_images(self, root: Path) -> ValidationRunResult:
    """Run manage_external_images.py ``--validate <root>`` as the "external-images" check."""
    # parents[2] is three directory levels above this file.
    book_dir = Path(__file__).resolve().parents[2]
    checker = book_dir / "tools" / "scripts" / "images" / "manage_external_images.py"
    arguments = ["--validate", str(root)]
    return self._delegate_script(checker, arguments, "external-images")
def _run_json_syntax(self, root: Path) -> ValidationRunResult:
    """Parse every ``*.json`` file under *root* and report those that fail.

    Files are opened explicitly as UTF-8 so the outcome does not depend on
    the platform's default encoding. A parse failure yields a
    ``json-syntax`` error at the line reported by the decoder; a file that
    cannot be read or decoded yields a ``json-syntax`` error at line 0.
    """
    t0 = time.time()
    json_files = sorted(root.rglob("*.json"))
    if not json_files:
        return ValidationRunResult(
            name="json-syntax", description="Validate JSON file syntax",
            files_checked=0, issues=[], elapsed_ms=0,
        )
    issues: List[ValidationIssue] = []
    for fpath in json_files:
        try:
            with open(fpath, "r", encoding="utf-8") as f:
                json.load(f)
        except json.JSONDecodeError as e:
            issues.append(ValidationIssue(
                file=str(fpath), line=e.lineno or 0, code="json-syntax",
                message=f"Invalid JSON: {e.msg}", severity="error",
            ))
        except (OSError, UnicodeError, RecursionError) as e:
            # Unreadable file, bytes that are not valid UTF-8, or nesting too
            # deep for the parser: report and carry on with the next file.
            issues.append(ValidationIssue(
                file=str(fpath), line=0, code="json-syntax",
                message=f"Cannot read: {e}", severity="error",
            ))
    elapsed = int((time.time() - t0) * 1000)
    return ValidationRunResult(
        name="json-syntax", description="Validate JSON file syntax",
        files_checked=len(json_files), issues=issues, elapsed_ms=elapsed,
    )
def _run_unit_tests(self, root: Path) -> ValidationRunResult:
    """Run quarto/mlsys/test_units.py (no arguments) as the "unit-tests" check.

    *root* is not used; the script path is derived from this file's location.
    """
    # This file lives at book/cli/commands/validate.py, so parents[2] is book/.
    here = Path(__file__).resolve()
    test_script = here.parents[2] / "quarto" / "mlsys" / "test_units.py"
    return self._delegate_script(test_script, [], "unit-tests")