mirror of https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-05 17:18:48 -05:00
3120 lines · 135 KiB · Python
"""
|
|
Native validation commands for MLSysBook Binder CLI.
|
|
|
|
Validation logic is implemented in Binder where possible (e.g. references,
|
|
citations, labels, figures, rendering). Some checks still delegate to scripts
|
|
under book/tools/scripts/ (tables, spelling, epub, sources, grid-tables,
|
|
images). See book/cli/BINDER_NATIVE_AUDIT.md for the full list.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import time
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
|
|
from rich.console import Console
|
|
from rich.panel import Panel
|
|
from rich.table import Table
|
|
|
|
from . import reference_check
|
|
|
|
console = Console()  # Shared Rich console for all human-readable CLI output.
|
|
|
|
|
|
@dataclass
class ValidationIssue:
    """A single problem found by a check, tied to a file location."""

    # Where the problem is: path (relative to the book root) and 1-based line.
    file: str
    line: int
    # Machine-readable issue code plus a human-readable explanation.
    code: str
    message: str
    # "error" fails the run; "warning" is reported but does not fail it.
    severity: str = "error"
    # Short excerpt of the offending text, for context in reports.
    context: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this issue to a JSON-friendly dict."""
        return dict(
            file=self.file,
            line=self.line,
            code=self.code,
            message=self.message,
            severity=self.severity,
            context=self.context,
        )
|
|
|
|
|
|
@dataclass
class ValidationRunResult:
    """Outcome of one named check executed over a set of files."""

    name: str
    description: str
    files_checked: int
    issues: List[ValidationIssue]
    elapsed_ms: int

    @property
    def passed(self) -> bool:
        """True when no issue has severity "error" (warnings still pass)."""
        return all(issue.severity != "error" for issue in self.issues)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the run, including every issue, to a JSON-friendly dict."""
        payload: Dict[str, Any] = {
            "name": self.name,
            "description": self.description,
            "files_checked": self.files_checked,
            "passed": self.passed,
            "issue_count": len(self.issues),
            "elapsed_ms": self.elapsed_ms,
        }
        payload["issues"] = [issue.to_dict() for issue in self.issues]
        return payload
|
|
|
|
|
|
# --- Inline-Python and code-cell parsing ------------------------------------
# `{python} var` or `{python} obj.attr` inline reference in prose.
INLINE_REF_PATTERN = re.compile(r"`\{python\}\s+(\w+(?:\.\w+)?)`")
# Opening / closing fences of Python code cells.
CELL_START_PATTERN = re.compile(r"^```\{python\}|^```python")
CELL_END_PATTERN = re.compile(r"^```\s*$")
# Simple assignment: "name = ..." — captures the bound name.
ASSIGN_PATTERN = re.compile(r"^([A-Za-z_]\w*)\s*=")
# Tuple unpacking: "a, b = ..." — captures all names on the left side
TUPLE_ASSIGN_PATTERN = re.compile(r"^((?:[A-Za-z_]\w*\s*,\s*)+[A-Za-z_]\w*)\s*=")
# Class definition header — captures the class name.
CLASS_DEF_PATTERN = re.compile(r"^class\s+(\w+)\s*[:(]")
# Grid-table separator row, e.g. "+----+====+".
GRID_TABLE_SEP_PATTERN = re.compile(r"^\+[-:=+]+\+$")
# Inline Python inside $...$ math; *_str variables (pre-formatted) are exempt.
LATEX_INLINE_PATTERN = re.compile(r"(?<!\\)\$\s*`\{python\}\s+(?!\w+(?:\.\w+)?_str)[^`]+`|`\{python\}\s+(?!\w+(?:\.\w+)?_str)[^`]+`\s*(?<!\\)\$")
# Inline Python immediately followed by a LaTeX operator like $\times$.
LATEX_ADJACENT_PATTERN = re.compile(r"`\{python\}\s+(?!\w+(?:\.\w+)?_str)[^`]+`\s*\$\\(times|approx|ll|gg|mu)\$")

# --- Citations --------------------------------------------------------------
# Bare @key token; also matches cross-ref labels (filtered out downstream).
CITATION_REF_PATTERN = re.compile(r"@([A-Za-z0-9_:\-.]+)")
# Bracketed citation group, e.g. [@a; @b] or [-@a].
CITATION_BRACKET_PATTERN = re.compile(r"\[-?@[A-Za-z0-9_:\-.]+(?:;\s*-?@[A-Za-z0-9_:\-.]+)*\]")

# --- Cross-reference labels -------------------------------------------------
# Per-type patterns that *define* a label (attribute blocks, #| label lines).
LABEL_DEF_PATTERNS = {
    "Figure": [
        re.compile(r"\{#(fig-[\w-]+)"),  # {#fig-xyz ...}
        re.compile(r"#\|\s*label:\s*(fig-[\w-]+)"),  # #| label: fig-xyz
        re.compile(r"%%\|\s*label:\s*(fig-[\w-]+)"),  # %%| label: fig-xyz (Jupyter)
    ],
    "Table": [
        re.compile(r"\{#(tbl-[\w-]+)"),  # {#tbl-xyz}
        re.compile(r"#\|\s*label:\s*(tbl-[\w-]+)"),  # #| label: tbl-xyz
    ],
    "Section": [
        re.compile(r"\{#(sec-[\w-]+)"),  # {#sec-xyz}
        re.compile(r"^id:\s*(sec-[\w-]+)"),  # YAML id: sec-xyz
    ],
    "Equation": [re.compile(r"\{#(eq-[\w-]+)")],  # {#eq-xyz}
    "Listing": [
        re.compile(r"\{#(lst-[\w-]+)"),  # {#lst-xyz ...}
        re.compile(r"#\|\s*label:\s*(lst-[\w-]+)"),  # #| label: lst-xyz
    ],
}
# @fig-/@tbl-/@sec-/@eq-/@lst- references in prose.
LABEL_REF_PATTERN = re.compile(r"@((?:fig|tbl|sec|eq|lst)-[\w-]+)")

# @-prefixed tokens with these prefixes are cross-refs, not bibliography keys.
EXCLUDED_CITATION_PREFIXES = ("fig-", "tbl-", "sec-", "eq-", "lst-", "ch-", "nb-")
|
|
|
|
|
|
class ValidateCommand:
    """Native `binder check` command group (also available as `binder validate`).

    Groups:
        refs      — inline-python, cross-refs, citations, inline patterns
        labels    — duplicate labels, orphaned/unreferenced labels
        headers   — section header IDs
        footnotes — placement rules, reference integrity
        figures   — captions/alt-text, float flow, image files
        rendering — render patterns, indexes, dropcaps, parts
        all       — run every check
    """

    # Maps group name → list of (scope_name, runner_method_name) pairs.
    # This is the single source of truth for the hierarchy: run(), help
    # printing, and scope validation all iterate this mapping.
    GROUPS: Dict[str, List[tuple]] = {
        "refs": [
            ("python-syntax", "_run_python_syntax"),
            ("inline-python", "_run_inline_python"),
            ("cross-refs", "_run_refs"),
            ("citations", "_run_citations"),
            ("inline", "_run_inline_refs"),
            ("self-ref", "_run_self_referential"),
        ],
        "labels": [
            ("duplicates", "_run_duplicate_labels"),
            ("orphans", "_run_unreferenced_labels"),
            ("fig-labels", "_run_fig_label_underscores"),
        ],
        "headers": [
            ("ids", "_run_headers"),
        ],
        "footnotes": [
            ("placement", "_run_footnote_placement"),
            ("integrity", "_run_footnote_refs"),
            ("cross-chapter", "_run_footnote_cross_chapter"),
        ],
        "figures": [
            ("captions", "_run_figures"),
            ("flow", "_run_float_flow"),
            ("files", "_run_images"),
        ],
        "rendering": [
            ("patterns", "_run_rendering"),
            ("python-echo", "_run_python_echo"),
            ("indexes", "_run_indexes"),
            ("dropcaps", "_run_dropcaps"),
            ("parts", "_run_parts"),
            ("heading-levels", "_run_heading_levels"),
            ("duplicate-words", "_run_duplicate_words"),
            ("grid-tables", "_run_grid_tables"),
            ("tables", "_run_table_content"),
            ("ascii", "_run_ascii"),
            ("percent-spacing", "_run_percent_spacing"),
            ("unit-spacing", "_run_unit_spacing"),
            ("binary-units", "_run_binary_units"),
            ("contractions", "_run_contractions"),
            ("unblended-prose", "_run_unblended_prose"),
            ("times-spacing", "_run_times_spacing"),
        ],
        "images": [
            ("formats", "_run_image_formats"),
            ("external", "_run_external_images"),
        ],
        "json": [
            ("syntax", "_run_json_syntax"),
        ],
        "units": [
            ("physics", "_run_unit_tests"),
        ],
        "spelling": [
            ("prose", "_run_spelling_prose"),
            ("tikz", "_run_spelling_tikz"),
        ],
        "epub": [
            ("structure", "_run_epub"),
        ],
        "sources": [
            ("citations", "_run_sources"),
        ],
        "references": [
            ("hallucinator", "_run_check_references"),
        ],
        "content": [
            ("tree", "_run_content_tree"),
        ],
    }
|
|
|
|
def __init__(self, config_manager, chapter_discovery):
|
|
self.config_manager = config_manager
|
|
self.chapter_discovery = chapter_discovery
|
|
|
|
    def run(self, args: List[str]) -> bool:
        """Parse CLI arguments and execute the requested check group.

        Args:
            args: Raw argument list (everything after `binder check`).

        Returns:
            True when parsing succeeded and no check produced an
            error-severity issue; False on parse errors, unknown scope,
            missing paths, or failed checks.
        """
        all_group_names = list(self.GROUPS.keys()) + ["all"]
        parser = argparse.ArgumentParser(
            prog="binder check",
            description="Run quality checks on book content",
            add_help=True,
        )
        parser.add_argument(
            "subcommand",
            nargs="?",
            choices=all_group_names,
            help="Check group to run (refs, labels, headers, footnotes, figures, rendering, references, content, all)",
        )
        # Generic scoping/output flags, shared by every group.
        parser.add_argument("--scope", default=None, help="Narrow to a specific check within a group")
        parser.add_argument("--path", default=None, help="File or directory path to check")
        parser.add_argument("--vol1", action="store_true", help="Scope to Volume I")
        parser.add_argument("--vol2", action="store_true", help="Scope to Volume II")
        parser.add_argument("--json", action="store_true", help="Emit machine-readable JSON output")
        parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
        # Flags consumed by the `refs` group runners.
        parser.add_argument("--citations-in-code", action="store_true", help="refs: check citations in code fences")
        parser.add_argument("--citations-in-raw", action="store_true", help="refs: check citations in raw blocks")
        parser.add_argument("--check-patterns", action="store_true", default=True, help="refs --scope inline: include pattern hazard checks (default: on)")
        parser.add_argument("--no-check-patterns", action="store_false", dest="check_patterns", help="refs --scope inline: skip pattern hazard checks")
        parser.add_argument("--check-scope", action="store_true", default=False, help="refs --scope inline: detect bare variable refs in class bodies that need ClassName.attr")
        parser.add_argument("--no-check-scope", action="store_false", dest="check_scope", help="refs --scope inline: skip scope analysis")
        # Label-type filters for the `labels` group.
        parser.add_argument("--figures", action="store_true", help="labels: filter to figures")
        parser.add_argument("--tables", action="store_true", help="labels: filter to tables")
        parser.add_argument("--sections", action="store_true", help="labels: filter to sections")
        parser.add_argument("--equations", action="store_true", help="labels: filter to equations")
        parser.add_argument("--listings", action="store_true", help="labels: filter to listings")
        parser.add_argument("--all-types", action="store_true", help="labels: all label types")
        # Flags consumed by the `references` (hallucinator) group.
        parser.add_argument("-f", "--file", dest="refs_file", action="append", metavar="BIB", help="references: .bib file(s) to check")
        parser.add_argument("-o", "--output", dest="refs_output", metavar="FILE", help="references: write report to FILE")
        parser.add_argument("--limit", type=int, dest="refs_limit", metavar="N", help="references: check only first N refs (quick test)")
        parser.add_argument("--skip-verified", dest="refs_skip_verified", action="store_true", help="references: skip refs already verified in cache")
        parser.add_argument("--thorough", dest="refs_thorough", action="store_true", help="references: revalidate all refs (ignore cache)")
        parser.add_argument("--refs-cache", dest="refs_cache", metavar="FILE", help="references: cache file (default: .references_verified.json in repo root)")
        parser.add_argument("--only-from-report", dest="refs_only_from_report", metavar="FILE", help="references: validate only keys that had issues in this report file")
        parser.add_argument("--only-keys", dest="refs_only_keys_file", metavar="FILE", help="references: validate only keys listed in FILE (one key per line)")

        try:
            ns = parser.parse_args(args)
        except SystemExit:
            # argparse uses SystemExit(0) for --help and non-zero for parse errors.
            return ("-h" in args) or ("--help" in args)

        if not ns.subcommand:
            # No group requested: show the overview table instead of running.
            self._print_check_help()
            return False

        root_path = self._resolve_path(ns.path, ns.vol1, ns.vol2)
        if not root_path.exists():
            self._emit(ns.json, {"status": "error", "message": f"Path not found: {root_path}"}, failed=True)
            return False

        runs: List[ValidationRunResult] = []

        if ns.subcommand == "all":
            # "all" runs every registered group; --scope is ignored here.
            for group_name in self.GROUPS:
                runs.extend(self._run_group(group_name, None, root_path, ns))
        else:
            group_name = ns.subcommand
            scope = ns.scope
            # Reject scopes that do not exist within the chosen group.
            if scope and not any(s == scope for s, _ in self.GROUPS.get(group_name, [])):
                valid = [s for s, _ in self.GROUPS[group_name]]
                console.print(f"[red]Unknown scope '{scope}' for group '{group_name}'.[/red]")
                console.print(f"[yellow]Valid scopes: {', '.join(valid)}[/yellow]")
                return False
            runs.extend(self._run_group(group_name, scope, root_path, ns))

        any_failed = any(not run.passed for run in runs)
        summary = {
            "status": "failed" if any_failed else "passed",
            "command": ns.subcommand,
            "path": str(root_path),
            "runs": [run.to_dict() for run in runs],
            "total_issues": sum(len(run.issues) for run in runs),
        }

        if ns.json:
            print(json.dumps(summary, indent=2))
        else:
            self._print_human_summary(summary, verbose=ns.verbose)

        return not any_failed
|
|
|
|
# ------------------------------------------------------------------
|
|
# Group dispatch
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_group(
|
|
self,
|
|
group: str,
|
|
scope: Optional[str],
|
|
root: Path,
|
|
ns: argparse.Namespace,
|
|
) -> List[ValidationRunResult]:
|
|
"""Run all checks in *group*, or just the one matching *scope*."""
|
|
results: List[ValidationRunResult] = []
|
|
for scope_name, method_name in self.GROUPS[group]:
|
|
if scope and scope != scope_name:
|
|
continue
|
|
method = getattr(self, method_name)
|
|
# Some runners need extra kwargs
|
|
if method_name == "_run_refs":
|
|
checks_code = ns.citations_in_code or (not ns.citations_in_code and not ns.citations_in_raw)
|
|
checks_raw = ns.citations_in_raw or (not ns.citations_in_code and not ns.citations_in_raw)
|
|
results.append(method(root, citations_in_code=checks_code, citations_in_raw=checks_raw))
|
|
elif method_name == "_run_inline_refs":
|
|
results.append(method(root, check_patterns=ns.check_patterns,
|
|
check_scope=getattr(ns, 'check_scope', False)))
|
|
elif method_name in ("_run_duplicate_labels", "_run_unreferenced_labels"):
|
|
results.append(method(root, self._selected_label_types(ns)))
|
|
elif method_name == "_run_check_references":
|
|
results.append(method(root, ns))
|
|
else:
|
|
results.append(method(root))
|
|
return results
|
|
|
|
    def _print_check_help(self) -> None:
        """Print a nicely formatted help for the check command."""
        # Three-column overview: group name, its scopes, a one-line blurb.
        table = Table(show_header=True, header_style="bold cyan", box=None)
        table.add_column("Group", style="cyan", width=14)
        table.add_column("Scopes", style="yellow", width=38)
        table.add_column("Description", style="white", width=32)

        # Blurbs keyed by group; keys should mirror GROUPS entries.
        descriptions = {
            "refs": "References, citations, inline Python, self-ref",
            "labels": "Duplicate labels, orphans, fig-label underscores",
            "headers": "Section header IDs ({#sec-...})",
            "footnotes": "Placement, integrity, cross-chapter duplicates",
            "figures": "Captions, float flow, image files",
            "rendering": "Patterns, indexes, dropcaps, headings, typos, tables, ASCII",
            "images": "Image file formats, external URLs",
            "json": "JSON file syntax validation",
            "units": "Physics engine unit conversion tests",
            "spelling": "Prose and TikZ spell checking (requires aspell)",
            "epub": "EPUB file validation",
            "sources": "Source citation analysis and validation",
            "references": "Bibliography vs academic DBs (hallucinator)",
            "content": "Content tree (shared/, frontmatter/ required)",
        }
        for group_name, checks in self.GROUPS.items():
            scopes = ", ".join(s for s, _ in checks)
            desc = descriptions.get(group_name, "")
            table.add_row(group_name, scopes, desc)
        # "all" is synthesized by run(), not stored in GROUPS.
        table.add_row("all", "(everything)", "Run all checks")

        console.print(Panel(table, title="binder check <group> [--scope <name>]", border_style="cyan"))
        console.print("[dim]Examples:[/dim]")
        console.print(" [cyan]./binder check refs[/cyan] [dim]# all reference checks[/dim]")
        console.print(" [cyan]./binder check refs --scope citations[/cyan] [dim]# only citation check[/dim]")
        console.print(" [cyan]./binder check figures --vol1[/cyan] [dim]# all figure checks, Vol I[/dim]")
        console.print(" [cyan]./binder check all[/cyan] [dim]# everything[/dim]")
        console.print()
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
def _resolve_path(self, path_arg: Optional[str], vol1: bool, vol2: bool) -> Path:
|
|
if path_arg:
|
|
path = Path(path_arg)
|
|
if not path.is_absolute():
|
|
path = (Path.cwd() / path).resolve()
|
|
return path
|
|
base = self.config_manager.book_dir / "contents"
|
|
if vol1 and not vol2:
|
|
return base / "vol1"
|
|
if vol2 and not vol1:
|
|
return base / "vol2"
|
|
return base
|
|
|
|
def _selected_label_types(self, ns: argparse.Namespace) -> Dict[str, List[re.Pattern[str]]]:
|
|
explicit = ns.figures or ns.tables or ns.sections or ns.equations or ns.listings
|
|
if ns.all_types:
|
|
return LABEL_DEF_PATTERNS
|
|
if explicit:
|
|
selected: Dict[str, List[re.Pattern[str]]] = {}
|
|
if ns.figures:
|
|
selected["Figure"] = LABEL_DEF_PATTERNS["Figure"]
|
|
if ns.tables:
|
|
selected["Table"] = LABEL_DEF_PATTERNS["Table"]
|
|
if ns.sections:
|
|
selected["Section"] = LABEL_DEF_PATTERNS["Section"]
|
|
if ns.equations:
|
|
selected["Equation"] = LABEL_DEF_PATTERNS["Equation"]
|
|
if ns.listings:
|
|
selected["Listing"] = LABEL_DEF_PATTERNS["Listing"]
|
|
return selected
|
|
# default: all label types
|
|
return LABEL_DEF_PATTERNS
|
|
|
|
def _qmd_files(self, root: Path) -> List[Path]:
|
|
if root.is_file():
|
|
return [root] if root.suffix == ".qmd" else []
|
|
return sorted(root.rglob("*.qmd"))
|
|
|
|
def _read_text(self, path: Path) -> str:
|
|
try:
|
|
return path.read_text(encoding="utf-8")
|
|
except UnicodeDecodeError:
|
|
return path.read_text(encoding="utf-8", errors="ignore")
|
|
|
|
def _relative_file(self, path: Path) -> str:
|
|
try:
|
|
return str(path.relative_to(self.config_manager.book_dir))
|
|
except ValueError:
|
|
return str(path)
|
|
|
|
    def _run_python_syntax(self, root: Path) -> ValidationRunResult:
        """Compile every ```{python} code block to catch syntax errors."""
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []

        # Only executable ```{python} cells are compiled here (plain
        # ```python listings are not matched by this start pattern).
        block_start_re = re.compile(r"^```\{python\}")
        block_end_re = re.compile(r"^```\s*$")

        for file in files:
            content = self._read_text(file)
            lines = content.split("\n")
            # NOTE: paths here are reported relative to the scan root, unlike
            # other checks which use self._relative_file (book root).
            rel = str(file.relative_to(root)) if file.is_relative_to(root) else str(file)

            in_block = False
            block_lines: List[str] = []
            block_start_line = 0

            for i, line in enumerate(lines, start=1):
                if block_start_re.match(line):
                    # Entering a cell: reset the buffer and remember where.
                    in_block = True
                    block_lines = []
                    block_start_line = i
                    continue
                if in_block and block_end_re.match(line):
                    in_block = False
                    # Skip YAML-style #| directives before compiling
                    source_lines = [
                        ln for ln in block_lines
                        if not ln.strip().startswith("#|")
                    ]
                    source = "\n".join(source_lines)
                    if not source.strip():
                        continue
                    try:
                        # Filename embeds the block's origin for tracebacks.
                        compile(source, f"{rel}:{block_start_line}", "exec")
                    except SyntaxError as exc:
                        # Translate the in-block line number back to the file.
                        err_line = block_start_line + (exc.lineno or 1)
                        issues.append(ValidationIssue(
                            file=rel,
                            line=err_line,
                            code="python_syntax",
                            message=f"Python syntax error: {exc.msg}",
                            severity="error",
                            context=(exc.text or "").strip()[:120],
                        ))
                    continue
                if in_block:
                    block_lines.append(line)

        return ValidationRunResult(
            name="python-syntax",
            description="Validate Python code block syntax (compile check)",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )
|
|
|
|
    def _run_inline_python(self, root: Path) -> ValidationRunResult:
        """Scan prose lines for malformed or hazardous inline-Python usage.

        Flags missing/wrong delimiters, inline Python inside math, grid
        tables, and headings. Lines inside fenced code blocks are skipped.
        """
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []

        # (code, pattern, message, severity) applied to every prose line.
        regex_checks = [
            ("missing_backtick", re.compile(r"(?<!`)(\{python\}\s+\w+`)"), "Missing opening backtick before {python}", "error"),
            ("dollar_as_backtick", re.compile(r"\$\{python\}\s+\w+`"), "Dollar sign used instead of backtick before {python}", "error"),
            ("display_math", re.compile(r"\$\$[^$]*`?\{python\}"), "Inline Python inside $$...$$ display math", "error"),
            # NOTE: $\times$ adjacent to inline Python is the PREFERRED convention.
            # Only flag non-_str variables inside $...$ math (decimal stripping risk).
            ("latex_adjacent_raw", re.compile(r"`\{python\}\s+(?!\w+_str)[^`]+`\s*\$\\(times|approx|ll|gg|mu|le|ge|neq|pm|cdot|div)"), "Non-_str inline Python adjacent to LaTeX operator (decimal stripping risk)", "warning"),
        ]

        for file in files:
            lines = self._read_text(file).splitlines()
            in_code_block = False  # inside a ``` fence — skip all checks
            in_grid = False        # inside a grid table — inline Python breaks there

            for idx, line in enumerate(lines, 1):
                stripped = line.strip()
                if stripped.startswith("```"):
                    # Any fence marker toggles code-block state.
                    in_code_block = not in_code_block
                    continue
                if in_code_block:
                    continue

                for code, pattern, message, severity in regex_checks:
                    for match in pattern.finditer(line):
                        issues.append(ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code=code,
                            message=message,
                            severity=severity,
                            context=match.group(0)[:160],
                        ))

                if LATEX_INLINE_PATTERN.search(line):
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="python_in_math",
                        message="Inline Python inside $...$ math can render incorrectly",
                        severity="error",
                        context=line.strip()[:160],
                    ))

                # Grid-table tracking: a +---+ separator opens a grid; any
                # non-empty line not starting with "|" closes it again.
                if GRID_TABLE_SEP_PATTERN.match(stripped):
                    in_grid = True
                elif in_grid and not stripped.startswith("|") and stripped:
                    in_grid = False

                if in_grid and "`{python}" in line:
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="grid_table_python",
                        message="Inline Python in grid table; convert to pipe table",
                        severity="error",
                        context=line.strip()[:160],
                    ))

                # Unwrapped {python} — missing backticks entirely
                # Match {python} NOT preceded by ` and NOT at start of #| label line
                if "{python}" in line and not stripped.startswith("#|"):
                    for um in re.finditer(r"(?<!`)\{python\}\s+\w+", line):
                        # Make sure it's not inside a backtick span
                        before = line[:um.start()]
                        if before.count("`") % 2 == 0:  # even backticks = not inside span
                            issues.append(ValidationIssue(
                                file=self._relative_file(file),
                                line=idx,
                                code="unwrapped_python",
                                message="Inline Python missing backtick wrapping — will render as literal text",
                                severity="error",
                                context=um.group(0)[:120],
                            ))

                # Inline Python in headings — fragile for TOC/bookmarks/PDF
                if stripped.startswith("#") and not stripped.startswith("#|") and "`{python}" in line:
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="python_in_heading",
                        message="Inline Python in heading — fragile for TOC, bookmarks, and PDF",
                        severity="warning",
                        context=stripped[:120],
                    ))

        return ValidationRunResult(
            name="inline-python",
            description="Validate inline Python syntax and placement",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )
|
|
|
|
def _run_refs(self, root: Path, citations_in_code: bool, citations_in_raw: bool) -> ValidationRunResult:
|
|
start = time.time()
|
|
files = self._qmd_files(root)
|
|
issues: List[ValidationIssue] = []
|
|
|
|
fenced_code_pattern = re.compile(r"```\{([^}]+)\}(.*?)```", re.DOTALL)
|
|
raw_block_pattern = re.compile(r"```\{=(html|latex|tex)\}(.*?)```", re.DOTALL | re.IGNORECASE)
|
|
problematic_classes = {"tikz", "latex", "tex"}
|
|
|
|
for file in files:
|
|
content = self._read_text(file)
|
|
if citations_in_code:
|
|
for match in fenced_code_pattern.finditer(content):
|
|
attrs = match.group(1)
|
|
code_content = match.group(2)
|
|
class_match = re.search(r"\.([A-Za-z][A-Za-z0-9_-]*)", attrs)
|
|
cls = class_match.group(1).lower() if class_match else "unknown"
|
|
if cls not in problematic_classes:
|
|
continue
|
|
for cite_match in CITATION_BRACKET_PATTERN.finditer(code_content):
|
|
offset = match.start() + len(f"```{{{attrs}}}") + cite_match.start()
|
|
line_no = content[:offset].count("\n") + 1
|
|
line = content.splitlines()[line_no - 1] if line_no - 1 < len(content.splitlines()) else ""
|
|
issues.append(ValidationIssue(
|
|
file=self._relative_file(file),
|
|
line=line_no,
|
|
code="citation_in_code",
|
|
message=f"Citation in .{cls} code block will not be processed",
|
|
severity="error",
|
|
context=line.strip()[:160],
|
|
))
|
|
|
|
if citations_in_raw:
|
|
for match in raw_block_pattern.finditer(content):
|
|
raw_type = match.group(1).lower()
|
|
block = match.group(2)
|
|
for cite_match in CITATION_BRACKET_PATTERN.finditer(block):
|
|
offset = match.start() + cite_match.start()
|
|
line_no = content[:offset].count("\n") + 1
|
|
line = content.splitlines()[line_no - 1] if line_no - 1 < len(content.splitlines()) else ""
|
|
issues.append(ValidationIssue(
|
|
file=self._relative_file(file),
|
|
line=line_no,
|
|
code="citation_in_raw",
|
|
message=f"Citation in raw {raw_type} block will not be processed",
|
|
severity="error",
|
|
context=line.strip()[:160],
|
|
))
|
|
|
|
return ValidationRunResult(
|
|
name="refs",
|
|
description="Validate citation/reference placement in raw/code blocks",
|
|
files_checked=len(files),
|
|
issues=issues,
|
|
elapsed_ms=int((time.time() - start) * 1000),
|
|
)
|
|
|
|
def _bibliography_for_qmd(self, file: Path) -> Optional[Path]:
|
|
"""Resolve the volume backmatter references.bib for a .qmd from its path."""
|
|
try:
|
|
rel = file.relative_to(self.config_manager.book_dir)
|
|
except ValueError:
|
|
return None
|
|
parts = rel.parts
|
|
if "vol1" in parts:
|
|
bib_file = self.config_manager.book_dir / "contents" / "vol1" / "backmatter" / "references.bib"
|
|
elif "vol2" in parts:
|
|
bib_file = self.config_manager.book_dir / "contents" / "vol2" / "backmatter" / "references.bib"
|
|
else:
|
|
return None
|
|
return bib_file if bib_file.exists() else None
|
|
|
|
    def _run_citations(self, root: Path) -> ValidationRunResult:
        """Check every @key citation against the owning volume's references.bib."""
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []

        # Captures the key from a BibTeX entry header, e.g. "@article{key2020,".
        bib_key_pattern = re.compile(r"@\w+\{([^,\s]+)")

        for file in files:
            bib_file = self._bibliography_for_qmd(file)
            if bib_file is None:
                # Not inside vol1/vol2 (or bib missing): nothing to check against.
                continue

            content = self._read_text(file)
            bib_content = self._read_text(bib_file)
            bib_keys = set(bib_key_pattern.findall(bib_content))
            # Strip YAML frontmatter (--- ... --- at file top) to avoid email false positives
            qmd_content_no_code = re.sub(r"^---\n.*?\n---\n", "", content, flags=re.DOTALL)
            # Strip HTML style/script blocks to avoid CSS @media false positives
            qmd_content_no_code = re.sub(r"<style\b[^>]*>.*?</style>", "", qmd_content_no_code, flags=re.DOTALL)
            # Drop fenced code and inline code spans: @tokens there aren't citations.
            qmd_content_no_code = re.sub(r"```.*?```", "", qmd_content_no_code, flags=re.DOTALL)
            qmd_content_no_code = re.sub(r"`[^`]+`", "", qmd_content_no_code)
            refs = set(CITATION_REF_PATTERN.findall(qmd_content_no_code))
            # Cross-ref prefixes (@fig- etc.) are labels, not bibliography keys.
            refs = {r.rstrip(".,;:") for r in refs if not r.startswith(EXCLUDED_CITATION_PREFIXES)}
            # Version-like tokens such as @3.5 are not citations either.
            refs = {r for r in refs if not re.match(r"^\d+\.\d+", r)}
            missing = sorted(refs - bib_keys)
            for key in missing:
                # Locate the first occurrence for error reporting.
                line_no = self._line_for_token(content, f"@{key}")
                issues.append(ValidationIssue(
                    file=self._relative_file(file),
                    line=line_no,
                    code="missing_citation",
                    message=f"Citation key @{key} missing in bibliography",
                    severity="error",
                    context=f"@{key}",
                ))

        return ValidationRunResult(
            name="citations",
            description="Validate citation keys against bibliography files",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )
|
|
|
|
def _run_duplicate_labels(self, root: Path, label_types: Dict[str, List[re.Pattern[str]]]) -> ValidationRunResult:
|
|
start = time.time()
|
|
files = self._qmd_files(root)
|
|
issues: List[ValidationIssue] = []
|
|
definitions: Dict[str, List[Tuple[Path, int, str]]] = {}
|
|
|
|
for file in files:
|
|
lines = self._read_text(file).splitlines()
|
|
in_code = False
|
|
for idx, line in enumerate(lines, 1):
|
|
stripped = line.strip()
|
|
if stripped.startswith("```"):
|
|
in_code = not in_code
|
|
continue
|
|
if in_code:
|
|
continue
|
|
for label_type, patterns in label_types.items():
|
|
for pattern in patterns:
|
|
for match in pattern.finditer(line):
|
|
label = match.group(1)
|
|
definitions.setdefault(label, []).append((file, idx, label_type))
|
|
|
|
for label, locations in definitions.items():
|
|
if len(locations) <= 1:
|
|
continue
|
|
for file, line_no, label_type in locations:
|
|
issues.append(ValidationIssue(
|
|
file=self._relative_file(file),
|
|
line=line_no,
|
|
code="duplicate_label",
|
|
message=f"Duplicate {label_type.lower()} label: {label}",
|
|
severity="error",
|
|
context=label,
|
|
))
|
|
|
|
return ValidationRunResult(
|
|
name="duplicate-labels",
|
|
description="Detect duplicate label definitions",
|
|
files_checked=len(files),
|
|
issues=issues,
|
|
elapsed_ms=int((time.time() - start) * 1000),
|
|
)
|
|
|
|
def _run_unreferenced_labels(self, root: Path, label_types: Dict[str, List[re.Pattern[str]]]) -> ValidationRunResult:
|
|
start = time.time()
|
|
files = self._qmd_files(root)
|
|
issues: List[ValidationIssue] = []
|
|
|
|
defined: Dict[str, Tuple[Path, int, str]] = {}
|
|
references: Dict[str, List[Tuple[Path, int]]] = {}
|
|
|
|
for file in files:
|
|
lines = self._read_text(file).splitlines()
|
|
for idx, line in enumerate(lines, 1):
|
|
for label_type, patterns in label_types.items():
|
|
for pattern in patterns:
|
|
for match in pattern.finditer(line):
|
|
defined.setdefault(match.group(1), (file, idx, label_type))
|
|
|
|
for match in LABEL_REF_PATTERN.finditer(line):
|
|
label = match.group(1)
|
|
references.setdefault(label, []).append((file, idx))
|
|
|
|
# unreferenced definitions (skip section defaults, consistent with legacy behavior)
|
|
for label, (file, line_no, label_type) in defined.items():
|
|
if label_type == "Section":
|
|
continue
|
|
if label not in references:
|
|
issues.append(ValidationIssue(
|
|
file=self._relative_file(file),
|
|
line=line_no,
|
|
code="unreferenced_label",
|
|
message=f"{label_type} label {label} is never referenced",
|
|
severity="warning",
|
|
context=label,
|
|
))
|
|
|
|
# unresolved references
|
|
defined_labels = set(defined.keys())
|
|
for label, locations in references.items():
|
|
if label in defined_labels:
|
|
continue
|
|
for file, line_no in locations:
|
|
issues.append(ValidationIssue(
|
|
file=self._relative_file(file),
|
|
line=line_no,
|
|
code="unresolved_reference",
|
|
message=f"Reference @{label} has no matching label definition",
|
|
severity="error",
|
|
context=f"@{label}",
|
|
))
|
|
|
|
return ValidationRunResult(
|
|
name="unreferenced-labels",
|
|
description="Detect unreferenced labels and unresolved references",
|
|
files_checked=len(files),
|
|
issues=issues,
|
|
elapsed_ms=int((time.time() - start) * 1000),
|
|
)
|
|
|
|
def _run_inline_refs(self, root: Path, check_patterns: bool,
                     check_scope: bool = False) -> ValidationRunResult:
    """Validate inline `{python}` references in .qmd files.

    Per file, two phases:
      1. Collect names defined inside python cells (simple assignments,
         tuple unpacking, and class definitions), then flag any inline
         ``{python}`` reference that resolves to none of them.
      2. If ``check_patterns``, flag rendering-hazard patterns: inline
         Python inside LaTeX math, grid tables, YAML caption options,
         caption syntax, inline f-strings, and inline function calls.
    If ``check_scope``, delegates per-file scope analysis to the
    ``validate_inline_refs`` module (failures there are swallowed).
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    # Hazard patterns local to this check (module-level patterns handle
    # cell boundaries, assignments, and LaTeX adjacency).
    yaml_option_inline = re.compile(r"^#\|\s*(fig-cap|tbl-cap|lst-cap|fig-alt):\s*.*`\{python\}")
    caption_syntax_inline = re.compile(r"^:\s+.*`\{python\}.*\{#(tbl|fig|lst)-")
    inline_fstring = re.compile(r"`\{python\}\s*f\"[^`]+`")
    inline_func_call = re.compile(r"`\{python\}\s*\w+\([^`]+\)`")

    for file in files:
        lines = self._read_text(file).splitlines()
        refs: List[Tuple[int, str]] = []      # (line number, referenced name)
        compute_vars: Set[str] = set()        # names assigned inside cells
        compute_classes: Set[str] = set()     # classes defined inside cells
        in_cell = False

        # Phase 1: collect cell-defined names and every inline reference.
        for idx, line in enumerate(lines, 1):
            if CELL_START_PATTERN.match(line.strip()):
                in_cell = True
                continue
            if in_cell and CELL_END_PATTERN.match(line.strip()):
                in_cell = False
                continue
            if in_cell:
                cls_match = CLASS_DEF_PATTERN.match(line.strip())
                if cls_match:
                    compute_classes.add(cls_match.group(1))
                assign = ASSIGN_PATTERN.match(line.strip())
                if assign:
                    compute_vars.add(assign.group(1))
                tuple_assign = TUPLE_ASSIGN_PATTERN.match(line.strip())
                if tuple_assign:
                    # a, b = ... — every target counts as defined.
                    for name in re.split(r'\s*,\s*', tuple_assign.group(1)):
                        compute_vars.add(name.strip())

            # Inline refs are collected everywhere, including inside cells.
            for match in INLINE_REF_PATTERN.finditer(line):
                refs.append((idx, match.group(1)))

        # Resolve refs; dotted refs (obj.attr) resolve via the base name,
        # which may be a class or an instance variable.
        for line_no, ref in refs:
            if "." in ref:
                cls_name = ref.split(".", 1)[0]
                resolved = cls_name in compute_classes or cls_name in compute_vars
            else:
                resolved = ref in compute_vars
            if not resolved:
                issues.append(ValidationIssue(
                    file=self._relative_file(file),
                    line=line_no,
                    code="undefined_inline_ref",
                    message=f"Inline reference `{ref}` is not defined in python cells",
                    severity="error",
                    context=f"`{{python}} {ref}`",
                ))

        # Phase 2: hazard patterns that break rendering (PDF/HTML).
        if check_patterns:
            in_grid = False
            for idx, line in enumerate(lines, 1):
                stripped = line.strip()
                if LATEX_INLINE_PATTERN.search(line):
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="latex_math_inline_python",
                        message="Inline Python inside LaTeX math can strip decimals",
                        severity="warning",
                        context=stripped[:160],
                    ))
                if LATEX_ADJACENT_PATTERN.search(line):
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="latex_adjacent_inline_python",
                        message="Inline Python adjacent to LaTeX operator is fragile",
                        severity="warning",
                        context=stripped[:160],
                    ))
                # Grid-table tracking: a separator row enters grid mode; any
                # non-pipe, non-blank line exits it.
                if GRID_TABLE_SEP_PATTERN.match(stripped):
                    in_grid = True
                elif in_grid and stripped and not stripped.startswith("|"):
                    in_grid = False
                if in_grid and "`{python}" in line:
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="grid_table_inline_python",
                        message="Inline Python in grid tables is unsupported",
                        severity="error",
                        context=stripped[:160],
                    ))
                if inline_fstring.search(line):
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="inline_fstring",
                        message="Inline f-string should be precomputed in Python cell",
                        severity="warning",
                        context=stripped[:160],
                    ))
                if inline_func_call.search(line):
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="inline_function_call",
                        message="Inline function call should be precomputed in Python cell",
                        severity="warning",
                        context=stripped[:160],
                    ))
                if yaml_option_inline.search(line):
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="yaml_option_inline_python",
                        message="Inline Python in YAML fig/tbl/lst metadata will not render",
                        severity="error",
                        context=stripped[:160],
                    ))
                if caption_syntax_inline.search(line):
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="caption_inline_python",
                        message="Inline Python in caption syntax will not render",
                        severity="error",
                        context=stripped[:160],
                    ))

        # Optional deeper scope analysis (project helper); best-effort only.
        if check_scope:
            # NOTE(review): BOOK_ROOT is imported but unused here — confirm
            # whether the import has a needed side effect or can be dropped.
            from book.quarto.mlsys.validate_inline_refs import check_scope as _check_scope, BOOK_ROOT
            try:
                scope_warnings = _check_scope(file, verbose=False)
                for filepath, lineno, check_type, msg in scope_warnings:
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=lineno,
                        code=check_type.lower(),
                        message=msg,
                        severity="warning",
                        context="",
                    ))
            except Exception:
                # Scope check is advisory; never fail the whole run on it.
                pass

    return ValidationRunResult(
        name="inline-refs",
        description="Validate inline Python refs and rendering hazard patterns",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Headers (ported from manage_section_ids.py --verify)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_headers(self, root: Path) -> ValidationRunResult:
    """Verify that every numbered markdown header carries a {#sec-...} ID.

    Headers inside fenced code blocks or `::: {.` divs are ignored, as are
    headers explicitly marked `.unnumbered`.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    header_pat = re.compile(r"^(#{1,6})\s+(.+?)(?:\s*\{[^}]*\})?$")
    div_start_pat = re.compile(r"^:::\s*\{\.")
    div_end_pat = re.compile(r"^:::\s*$")
    code_block_pat = re.compile(r"^```[^`]*$")
    sec_id_pat = re.compile(r"\{#sec-[^}]+\}")

    for path in files:
        in_code = False
        in_div = False

        for lineno, raw in enumerate(self._read_text(path).splitlines(), 1):
            stripped = raw.strip()

            # Toggle fenced-code state; skip contents of code blocks.
            if code_block_pat.match(stripped):
                in_code = not in_code
                continue
            if in_code:
                continue
            # Skip div contents (flag-based; does not handle nesting,
            # matching the legacy checker).
            if div_start_pat.match(stripped):
                in_div = True
                continue
            if div_end_pat.match(stripped):
                in_div = False
                continue
            if in_div:
                continue

            header = header_pat.match(raw)
            if header is None:
                continue

            # Pull the attribute braces, if present, to test .unnumbered.
            attrs = ""
            open_pos = raw.find("{")
            if open_pos != -1:
                close_pos = raw.rfind("}")
                if close_pos > open_pos:
                    attrs = raw[open_pos : close_pos + 1]
            if ".unnumbered" in attrs:
                continue

            if sec_id_pat.search(raw):
                continue
            issues.append(
                ValidationIssue(
                    file=self._relative_file(path),
                    line=lineno,
                    code="missing_section_id",
                    message=f"Header missing section ID: {header.group(2).strip()}",
                    severity="error",
                    context=raw.strip()[:160],
                )
            )

    return ValidationRunResult(
        name="headers",
        description="Verify section headers have {#sec-...} IDs",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Footnote Placement (ported from check_forbidden_footnotes.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_footnote_placement(self, root: Path) -> ValidationRunResult:
    """Flag footnotes placed where Quarto/LaTeX rendering breaks.

    Checks, per line:
      * inline footnote syntax ``^[...]`` (always forbidden — the book
        style requires named ``[^fn-...]`` references),
      * ``[^fn-...]`` references inside table cells, YAML captions,
        markdown captions, callout titles, and div blocks.
    Div nesting is tracked with a depth counter; footnotes on the div's
    own opening line are allowed (e.g. in attributes), later lines are not.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    fn_pat = re.compile(r"\[\^fn-[\w-]+\]")
    inline_fn_pat = re.compile(r"\^\[[^\]]+\]")
    table_sep_pat = re.compile(r"^\|[\s\-:+]+\|")

    for file in files:
        lines = self._read_text(file).splitlines()
        div_depth = 0
        div_start_line = 0  # line number of the outermost open div

        for idx, line in enumerate(lines, 1):
            stripped = line.strip()

            # Track div nesting (`:::`/`::::` fences, with or without attrs).
            if re.match(r"^:{3,4}\s*\{", stripped) or re.match(r"^:{3,4}\s+\w", stripped):
                div_depth += 1
                if div_depth == 1:
                    div_start_line = idx
            elif re.match(r"^:{3,4}\s*$", stripped):
                if div_depth > 0:
                    div_depth -= 1
                    if div_depth == 0:
                        div_start_line = 0

            # Inline footnotes are always forbidden.
            for m in inline_fn_pat.finditer(line):
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        # Fix: was a pointless f-string with no placeholders.
                        code="inline_footnote",
                        message="Inline footnote syntax; use [^fn-name] reference format",
                        severity="error",
                        context=m.group(0)[:80],
                    )
                )

            footnotes = fn_pat.findall(line)
            if not footnotes:
                continue

            # Table cell check (separator rows are not content rows).
            if stripped.startswith("|") and stripped.count("|") >= 2 and not table_sep_pat.match(stripped):
                for fn in footnotes:
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="footnote_in_table",
                            message=f"Footnote {fn} in table cell",
                            severity="error",
                            context=stripped[:80],
                        )
                    )

            # YAML caption check.
            # NOTE(review): pattern does not match `#| fig-cap:` cell
            # options, only bare `fig-cap:` lines — confirm intended scope.
            if re.match(r"^\s*(fig-cap|tbl-cap):", line):
                cap_type = "figure" if "fig-cap" in line else "table"
                for fn in footnotes:
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code=f"footnote_in_{cap_type}_caption",
                            message=f"Footnote {fn} in {cap_type} caption",
                            severity="error",
                            context=stripped[:80],
                        )
                    )

            # Markdown caption check (`: **Title**: ...`).
            if re.match(r"^:\s*\*\*[^*]+\*\*:", line):
                for fn in footnotes:
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="footnote_in_markdown_caption",
                            message=f"Footnote {fn} in markdown caption",
                            severity="error",
                            context=stripped[:80],
                        )
                    )

            # Callout title check: footnotes in title="..." break LaTeX.
            if re.match(r"^:{3,4}\s*\{.*title=", stripped):
                title_match = re.search(r'title="([^"]*)"', line)
                if title_match and fn_pat.search(title_match.group(1)):
                    for fn in fn_pat.findall(title_match.group(1)):
                        issues.append(
                            ValidationIssue(
                                file=self._relative_file(file),
                                line=idx,
                                code="footnote_in_callout_title",
                                message=f"Footnote {fn} in callout title (breaks LaTeX)",
                                severity="error",
                                context=stripped[:80],
                            )
                        )

            # Div block check: footnotes inside a div body (not its opener).
            if div_depth > 0 and div_start_line != idx:
                for fn in footnotes:
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="footnote_in_div",
                            message=f"Footnote {fn} inside div block (started line {div_start_line})",
                            severity="error",
                            context=stripped[:80],
                        )
                    )

    return ValidationRunResult(
        name="footnote-placement",
        description="Check footnotes in forbidden locations",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Footnote Refs (ported from footnote_cleanup.py --validate)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_footnote_refs(self, root: Path) -> ValidationRunResult:
    """Cross-check footnote references against their definitions per file.

    Reports:
      * references with no definition (error),
      * definitions never referenced (warning),
      * duplicate definitions (error),
      * definitions without a preceding blank line (error — Pandoc then
        renders the marker as literal text).
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    ref_pat = re.compile(r"\[\^([^]]+)\]")                      # any [^id]
    def_pat = re.compile(r"^\[\^([^]]+)\]:\s*(.+)$", re.MULTILINE)  # [^id]: text

    for file in files:
        content = self._read_text(file)
        lines = content.split("\n")

        # Collect definitions (id -> definition text).
        fn_defs: Dict[str, str] = {}
        for m in def_pat.finditer(content):
            fn_defs[m.group(1)] = m.group(2)

        # Collect references; a definition line is not a reference to its
        # own id, but may still reference other footnotes.
        fn_refs: Dict[str, List[int]] = defaultdict(list)
        for line_num, line in enumerate(lines, 1):
            for m in ref_pat.finditer(line):
                fn_id = m.group(1)
                dm = def_pat.match(line)
                if dm and dm.group(1) == fn_id:
                    continue  # definition line, not a reference
                fn_refs[fn_id].append(line_num)

        # Undefined references — report at the first usage site.
        for fn_id in sorted(set(fn_refs.keys()) - set(fn_defs.keys())):
            first_line = fn_refs[fn_id][0]
            issues.append(
                ValidationIssue(
                    file=self._relative_file(file),
                    line=first_line,
                    code="undefined_footnote_ref",
                    message=f"Undefined footnote reference: [^{fn_id}]",
                    severity="error",
                    context=f"[^{fn_id}]",
                )
            )

        # Unused definitions — warning only.
        for fn_id in sorted(set(fn_defs.keys()) - set(fn_refs.keys())):
            def_line = self._line_for_token(content, f"[^{fn_id}]:")
            issues.append(
                ValidationIssue(
                    file=self._relative_file(file),
                    line=def_line,
                    code="unused_footnote_def",
                    message=f"Unused footnote definition: [^{fn_id}]",
                    severity="warning",
                    context=f"[^{fn_id}]:",
                )
            )

        # Duplicate definitions — reported at the first occurrence.
        def_counts: Dict[str, int] = defaultdict(int)
        for line in lines:
            dm = re.match(r"^\[\^([^]]+)\]:", line)
            if dm:
                def_counts[dm.group(1)] += 1
        for fn_id, count in def_counts.items():
            if count > 1:
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=self._line_for_token(content, f"[^{fn_id}]:"),
                        code="duplicate_footnote_def",
                        message=f"Duplicate footnote definition ({count}x): [^{fn_id}]",
                        severity="error",
                        context=f"[^{fn_id}]:",
                    )
                )

        # Missing blank line before footnote definition.
        # Pandoc requires footnote definitions to start a new block.
        # Without a preceding blank line, Pandoc treats the definition
        # as continuation text and renders [^fn-name] as literal text.
        fn_def_line_pat = re.compile(r"^\[\^[^\]]+\]:")
        for idx, line in enumerate(lines):
            if fn_def_line_pat.match(line) and idx > 0:
                prev = lines[idx - 1]
                if prev.strip():  # previous line is not blank
                    fn_match = re.match(r"^\[\^([^\]]+)\]:", line)
                    fn_id_str = fn_match.group(1) if fn_match else "?"
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx + 1,
                            code="footnote_missing_blank_line",
                            message=(
                                f"Footnote definition [^{fn_id_str}] has no blank line before it — "
                                f"Pandoc will not parse it as a footnote"
                            ),
                            severity="error",
                            context=f"prev: {prev.strip()[:60]}",
                        )
                    )

    return ValidationRunResult(
        name="footnote-refs",
        description="Validate footnote references and definitions",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Figures (ported from check_figure_completeness.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_figures(self, root: Path) -> ValidationRunResult:
    """Check that every figure has both a caption and alt-text.

    Pass 1 covers attribute-based figures (``{#fig-...}`` with
    ``fig-cap=``/``fig-alt=`` attributes or a markdown ``![caption](...)``).
    Pass 2 covers code-cell figures declared via ``#| label: fig-...``
    options; IDs already handled in pass 1 are skipped via ``seen_ids``.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    fig_id_pat = re.compile(r"\{#(fig-[a-zA-Z0-9_-]+)[\s}]")
    md_cap_pat = re.compile(r"!\[(.+?)\]\(")  # markdown image caption

    for file in files:
        lines = self._read_text(file).splitlines()
        seen_ids: Set[str] = set()  # fig IDs handled in pass 1

        # Pass 1: attribute-based figures
        for idx, line in enumerate(lines, 1):
            m = fig_id_pat.search(line)
            if not m:
                continue
            fig_id = m.group(1)
            has_cap = bool(re.search(r'fig-cap="[^"]+', line))
            has_alt = bool(re.search(r'fig-alt="[^"]+', line))

            # A non-empty markdown image caption also counts as a caption.
            if "![" in line:
                md_m = md_cap_pat.search(line)
                if md_m and md_m.group(1).strip():
                    has_cap = True

            seen_ids.add(fig_id)
            missing = []
            if not has_cap:
                missing.append("caption")
            if not has_alt:
                missing.append("alt-text")
            if missing:
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="incomplete_figure",
                        message=f"Figure {fig_id} missing: {', '.join(missing)}",
                        severity="error",
                        context=line.strip()[:120],
                    )
                )

        # Pass 2: code-cell figures (```{python} ... #| label: fig-...)
        in_code = False
        code_start = 0           # line number of the opening fence
        cell_opts: Dict[str, str] = {}  # accumulated #| options for the cell
        for idx, line in enumerate(lines, 1):
            stripped = line.rstrip()
            if not in_code and re.match(r"^```\{(?:python|r|julia|ojs)", stripped):
                in_code = True
                code_start = idx
                cell_opts = {}
                continue
            if in_code and stripped == "```":
                # Cell closed: evaluate its label/caption/alt options.
                label = cell_opts.get("label", "")
                if label.startswith("fig-") and label not in seen_ids:
                    cap_val = cell_opts.get("fig-cap", "")
                    alt_val = cell_opts.get("fig-alt", "")
                    missing = []
                    if not cap_val:
                        missing.append("caption")
                    if not alt_val:
                        missing.append("alt-text")
                    if missing:
                        issues.append(
                            ValidationIssue(
                                file=self._relative_file(file),
                                line=code_start,
                                code="incomplete_figure",
                                message=f"Figure {label} missing: {', '.join(missing)}",
                                severity="error",
                                context=f"code-cell figure {label}",
                            )
                        )
                    seen_ids.add(label)
                in_code = False
                cell_opts = {}
                continue
            if in_code:
                # Collect `#| key: value` options, stripping quotes.
                opt_m = re.match(r"^#\|\s*([\w-]+):\s*(.+)$", stripped)
                if opt_m:
                    val = opt_m.group(2).strip().strip("\"'")
                    cell_opts[opt_m.group(1)] = val

    return ValidationRunResult(
        name="figures",
        description="Check figures have captions and alt-text",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Float Flow (ported from figure_table_flow_audit.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_float_flow(self, root: Path) -> ValidationRunResult:
    """Audit figure/table (float) placement relative to their first reference.

    Per file, collects every float definition (div-based, markdown image,
    table caption, or code-cell label) and every ``@fig-``/``@tbl-``
    reference, then flags:
      * ``orphan_float`` — defined but never referenced (warning),
      * ``late_float``  — definition trails the first reference by more
        than ~30 prose lines (code-block lines are subtracted from the gap),
      * ``early_float`` — definition precedes the first reference by more
        than 5 lines.
    References inside a float's own div, and on fig-cap/fig-alt lines,
    do not count.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    div_def_pat = re.compile(r":::\s*\{[^}]*#((?:fig|tbl)-[\w-]+)")
    img_def_pat = re.compile(r"!\[.*?\]\(.*?\)\s*\{[^}]*#((?:fig|tbl)-[\w-]+)")
    tbl_cap_pat = re.compile(r"^:\s+.*\{[^}]*#((?:fig|tbl)-[\w-]+)")
    ref_pat = re.compile(r"@((?:fig|tbl)-[\w-]+)")

    for file in files:
        lines = self._read_text(file).splitlines()
        defs: Dict[str, int] = {}                 # label -> definition line
        refs: Dict[str, List[int]] = defaultdict(list)  # label -> ref lines
        in_code = False
        in_float = False                          # inside a float's div body
        float_label: Optional[str] = None
        code_spans: List[Tuple[int, int]] = []    # (start, end) of code cells
        code_start = 0
        cell_opts: Dict[str, str] = {}

        for idx, line in enumerate(lines, 1):
            stripped = line.rstrip()

            # Code block tracking: record spans and code-cell labels.
            if not in_code and re.match(r"^```\{", stripped):
                in_code = True
                code_start = idx
                cell_opts = {}
                continue
            if in_code and stripped == "```":
                code_spans.append((code_start, idx))
                label = cell_opts.get("label", "")
                # First definition wins.
                if label.startswith(("fig-", "tbl-")) and label not in defs:
                    defs[label] = code_start
                in_code = False
                cell_opts = {}
                continue
            if in_code:
                opt_m = re.match(r"^#\|\s*([\w-]+):\s*(.+)$", stripped)
                if opt_m:
                    cell_opts[opt_m.group(1)] = opt_m.group(2).strip().strip("\"'")
                continue

            # Attribute-based definitions (div, image, table caption).
            for pat in [div_def_pat, img_def_pat, tbl_cap_pat]:
                m = pat.search(line)
                if m:
                    label = m.group(1)
                    if label not in defs:
                        defs[label] = idx
                    if pat == div_def_pat:
                        # Entering the float's own div body.
                        in_float = True
                        float_label = label

            # Track float block end (a bare ::: closer, not a new opener).
            if in_float:
                ls = line.strip()
                if ls.startswith(":::") and not ls.startswith("::: {"):
                    in_float = False
                    float_label = None

            # References — skip caption/alt attribute lines entirely, and
            # skip self-references inside the float's own div.
            if "fig-cap=" in line or "fig-alt=" in line:
                continue
            for m in ref_pat.finditer(line):
                label = m.group(1)
                if in_float and label == float_label:
                    continue
                refs[label].append(idx)

        # Evaluate placement status per label.
        all_labels = set(defs.keys()) | set(refs.keys())
        for label in sorted(all_labels):
            def_line = defs.get(label)
            ref_lines = refs.get(label, [])
            first_ref = min(ref_lines) if ref_lines else None

            if not def_line:
                continue  # XREF — informational, skip
            if not first_ref:
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=def_line,
                        code="orphan_float",
                        message=f"{'Figure' if label.startswith('fig-') else 'Table'} {label} defined but never referenced",
                        severity="warning",
                        context=label,
                    )
                )
                continue

            # Compute prose gap: raw gap minus code-cell lines in between.
            gap = def_line - first_ref
            code_lines = 0
            if gap > 0:
                for cs, ce in code_spans:
                    # Overlap of [first_ref, def_line] with the code span.
                    os_ = max(first_ref, cs)
                    oe_ = min(def_line, ce)
                    if os_ <= oe_:
                        code_lines += oe_ - os_ + 1
            prose_gap = gap - code_lines

            if prose_gap > 30:
                # Before flagging, see if ANY reference sits close enough.
                closest = min(ref_lines, key=lambda r: abs(def_line - r))
                closest_gap = def_line - closest
                if -5 <= closest_gap <= 30:
                    continue  # OK
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=def_line,
                        code="late_float",
                        message=f"{label} defined at L{def_line}, first referenced at L{first_ref} (too far after mention)",
                        severity="warning",
                        context=label,
                    )
                )
            elif prose_gap < -5:
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=def_line,
                        code="early_float",
                        message=f"{label} defined at L{def_line}, first referenced at L{first_ref} (appears before mention)",
                        severity="warning",
                        context=label,
                    )
                )

    return ValidationRunResult(
        name="float-flow",
        description="Audit figure/table placement relative to first reference",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Indexes (ported from check_index_placement.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_indexes(self, root: Path) -> ValidationRunResult:
    """Flag LaTeX \\index{} entries in placements that break rendering.

    Table-driven: each check is (issue code, pattern, message). Lines
    inside fenced code blocks are skipped, as are fig-cap lines for the
    div-adjacency check.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    checks = [
        ("index_on_heading", re.compile(r"^#{1,6}\s+.*\\index\{"), "\\index{} on same line as heading"),
        ("index_before_div", re.compile(r"\\index\{[^}]*\}:::"), "\\index{} directly before ::: (div/callout)"),
        ("index_after_div", re.compile(r"^::+\s+\{[^}]*\}\s*\\index\{"), "\\index{} on same line as div/callout"),
        ("index_before_footnote", re.compile(r"^\\index\{[^}]*\}.*\[\^[^\]]+\]:"), "\\index{} before footnote definition"),
    ]

    for path in files:
        inside_fence = False
        for lineno, text in enumerate(self._read_text(path).splitlines(), 1):
            if text.strip().startswith("```"):
                inside_fence = not inside_fence
                continue
            if inside_fence:
                continue

            for issue_code, pattern, message in checks:
                # fig-cap attribute lines legitimately look like divs.
                if issue_code == "index_after_div" and "fig-cap=" in text:
                    continue
                if not pattern.search(text):
                    continue
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(path),
                        line=lineno,
                        code=issue_code,
                        message=message,
                        severity="error",
                        context=text.strip()[:120],
                    )
                )

    return ValidationRunResult(
        name="indexes",
        description="Check LaTeX \\index{} placement",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Rendering (ported from check_render_patterns.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_rendering(self, root: Path) -> ValidationRunResult:
    """Scan prose lines for patterns that break Quarto/LaTeX rendering.

    Combines a table of simple regex checks (malformed inline Python,
    unescaped currency, etc.) with stateful checks: inline Python inside
    grid tables, inline Python inside $...$ math spans, and a lowercase
    'x' used as a multiplication sign in prose. Fenced code blocks are
    skipped entirely.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    # (code, pattern, message, severity) — applied to every prose line.
    regex_checks = [
        ("missing_opening_backtick", re.compile(r"(?<!`)(\{python\}\s+\w+`)"), "Missing opening backtick on inline Python", "error"),
        ("dollar_before_python", re.compile(r"\$\{python\}\s+\w+`"), "Dollar sign instead of backtick before {python}", "error"),
        ("quad_asterisks", re.compile(r"\*{4,}"), "Quad asterisks — likely malformed bold/italic", "warning"),
        ("footnote_in_table", re.compile(r"^\|.*\[\^fn-[^\]]+\].*\|"), "Footnote in table cell — may break PDF", "warning"),
        ("double_dollar_python", re.compile(r"\$\$[^$]*`\{python\}"), "Inline Python in display math", "error"),
        # Currency: unescaped $ before number can be parsed as math. Use \$ for currency (see book-prose.md).
        # Match: $1,000 (comma), $4.00 (decimal), $50 million/billion/etc.
        # Exclude: $1.5 \times (math), $0.5$ (inline math), $4.6 / (division).
        ("unescaped_currency", re.compile(
            r"(?<!\\)\$[0-9]{1,3}(?:,[0-9]{3})+(?=\s(?!\s*\\times)|,[0-9]|\)|$)"  # $1,000, exclude $25,000 \times
            r"|(?<!\\)\$[0-9]+\.[0-9]+(?=\s(?!\s*\\times)(?!\s*/)(?!\s*-)(?!\s*\+)(?!\s*\\ll)|,[0-9]|\)|$|/)(?!\\$)"  # $4.00, exclude math
            r"|(?<!\\)\$[0-9]+(?=\s+(?:million|billion|thousand|M|B|K|per|each|/))"  # $50 million
        ), "Unescaped dollar before number — use \\$ for currency", "warning"),
    ]

    grid_sep_pat = re.compile(r"^\+[-:=+]+\+$")          # grid-table separator row
    math_span_pat = re.compile(r"(?<!\\)\$(?!\$)(?!`)(.+?)(?<!\\)\$")  # $...$ span

    # Lowercase 'x' used as multiplication in prose (should be $\times$).
    # Matches: `...`x word, NUMx word — but NOT hex (0x61), code, fig-alt, or \index.
    # The pattern requires a lowercase letter after x+space, which naturally
    # excludes hardware counts like "8x A100" (uppercase after x).
    lowercase_x_mult_pat = re.compile(
        r"""`x\s+[a-z]"""  # `...`x word (after inline python)
        r"""|"""
        r"""\dx\s+[a-z]"""  # Nx word (digit then x then lowercase)
    )
    # Hex literal pattern to exclude matches like 0x61, 0xff
    hex_literal_pat = re.compile(r"0x[0-9a-fA-F]")
    # fig-alt lines to skip
    fig_alt_pat = re.compile(r'fig-alt\s*=\s*"')

    for file in files:
        lines = self._read_text(file).splitlines()
        in_grid = False
        in_code = False

        for idx, line in enumerate(lines, 1):
            stripped = line.strip()

            # Code block tracking — skip everything inside fences.
            if stripped.startswith("```"):
                in_code = not in_code
                continue
            if in_code:
                continue

            # Grid table tracking: separator row enters, any non-table
            # non-blank line exits.
            if grid_sep_pat.match(stripped):
                in_grid = True
            elif in_grid and not stripped.startswith("|") and not grid_sep_pat.match(stripped) and stripped:
                in_grid = False

            if in_grid and "`{python}" in line:
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="grid_table_python",
                        message="Grid table with inline Python — convert to pipe table",
                        severity="error",
                        context=stripped[:120],
                    )
                )

            # Python inside $...$ math. Superscript-wrapped inline python
            # (^{...`{python}...`...}) is tolerated — strip it first.
            for m in math_span_pat.finditer(line):
                inner = m.group(1)
                if "{python}" not in inner:
                    continue
                inner_clean = re.sub(r"\^\{[^}]*`\{python\}[^`]*`[^}]*\}", "", inner)
                if "{python}" in inner_clean:
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="python_in_dollar_math",
                            message="Inline Python inside $...$ math block",
                            severity="error",
                            context=m.group(0)[:120],
                        )
                    )

            # Lowercase 'x' used as multiplication in prose
            # Skip fig-alt lines and index entries
            if not fig_alt_pat.search(line) and not stripped.startswith("\\index"):
                for rm in lowercase_x_mult_pat.finditer(line):
                    # Exclude hex literals like 0x61, 0xff
                    ctx_start = max(0, rm.start() - 1)
                    if hex_literal_pat.match(line[ctx_start : rm.end()]):
                        continue
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="lowercase_x_multiplication",
                            message="Lowercase 'x' used as multiplication — use $\\times$ instead",
                            severity="warning",
                            context=rm.group(0)[:120],
                        )
                    )

            # Standard regex checks — one issue per regex match.
            for code, pattern, message, severity in regex_checks:
                for rm in pattern.finditer(line):
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code=code,
                            message=message,
                            severity=severity,
                            context=rm.group(0)[:120],
                        )
                    )

    return ValidationRunResult(
        name="rendering",
        description="Check for problematic rendering patterns",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
def _run_python_echo(self, root: Path) -> ValidationRunResult:
    """Ensure every ```{python} block has #| echo: false (code must not appear in output).

    Only the option header of each cell is scanned (leading ``#|`` lines
    and blanks); once a real code line is reached the options are over,
    so an ``echo: false`` buried in code does not count.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    block_start_re = re.compile(r"^```\{python\}")
    block_end_re = re.compile(r"^```\s*$")
    # Quarto chunk option: #| echo: false (with optional whitespace)
    echo_false_re = re.compile(r"#\|\s*echo\s*:\s*false", re.IGNORECASE)

    for file in files:
        lines = self._read_text(file).splitlines()
        i = 0  # manual index: we jump past whole blocks, not line-by-line
        while i < len(lines):
            line = lines[i]
            if not block_start_re.match(line):
                i += 1
                continue
            start_line = i + 1  # 1-based line of the opening fence
            found_echo_false = False
            j = i + 1
            # Scan option lines: #| key: value, or blank, until we hit code or closing ```
            while j < len(lines):
                next_line = lines[j]
                if block_end_re.match(next_line):
                    break
                stripped = next_line.strip()
                if echo_false_re.search(stripped):
                    found_echo_false = True
                    break
                # Option line or blank — keep scanning
                if stripped.startswith("#|") or not stripped:
                    j += 1
                    continue
                # Non-option line (actual code or comment) — options are done
                break
            if not found_echo_false:
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=start_line,
                        code="python_missing_echo_false",
                        message="Python block must include #| echo: false — code must not appear in rendered output",
                        severity="error",
                        context="Add #| echo: false as first line after ```{python}",
                    )
                )
            # Advance past this block to the line after closing ```
            k = j
            while k < len(lines) and not block_end_re.match(lines[k]):
                k += 1
            i = k + 1

    return ValidationRunResult(
        name="python-echo",
        description="Check Python blocks have echo: false",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Dropcaps (ported from validate_dropcap_compat.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_dropcaps(self, root: Path) -> ValidationRunResult:
    """Validate that the drop-cap paragraph of each chapter renders safely.

    Finds the first paragraph after the first numbered ## heading that
    follows a chapter heading ({#sec-...}), skipping frontmatter, code
    fences, divs, comments, raw LaTeX, and list items. That paragraph
    must not begin with a cross-reference, a markdown link, or inline
    code — all of which break the drop-cap LaTeX treatment. Only the
    first qualifying paragraph per file is checked.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    chapter_hdr = re.compile(r"^#\s+[^#].*\{#sec-")       # chapter heading
    numbered_h2 = re.compile(r"^##\s+[^#]")               # any ## heading
    unnumbered_h2 = re.compile(r"^##\s+.*\{.*\.unnumbered.*\}")
    starts_xref = re.compile(r"^\s*@(sec|fig|tbl|lst|eq)-")
    starts_link = re.compile(r"^\s*\[")
    starts_inline = re.compile(r"^\s*`")
    yaml_fence = re.compile(r"^---\s*$")
    code_fence = re.compile(r"^```")
    div_fence = re.compile(r"^:::")
    blank = re.compile(r"^\s*$")
    html_comment = re.compile(r"^\s*<!--")
    raw_latex = re.compile(r"^\s*\\")
    list_item = re.compile(r"^\s*[-*+]|\s*\d+\.")

    for file in files:
        lines = self._read_text(file).splitlines()
        in_fm = False        # inside YAML frontmatter
        in_code = False      # inside a fenced code block
        in_div = 0           # div nesting depth
        found_chapter = False
        found_h2 = False     # past the first numbered ## after the chapter

        for idx, line in enumerate(lines, 1):
            # Frontmatter only counts when the file opens with ---.
            if idx == 1 and yaml_fence.match(line):
                in_fm = True
                continue
            if in_fm:
                if yaml_fence.match(line):
                    in_fm = False
                continue
            if code_fence.match(line):
                in_code = not in_code
                continue
            if in_code:
                continue
            # Div depth: bare ::: closes, ::: with anything opens.
            if div_fence.match(line):
                stripped = line.strip()
                if stripped == ":::":
                    in_div = max(0, in_div - 1)
                elif stripped.startswith(":::"):
                    in_div += 1
                continue
            if in_div > 0:
                continue

            if chapter_hdr.match(line):
                found_chapter = True
                found_h2 = False  # reset: look for this chapter's first ##
                continue
            if not found_chapter:
                continue
            if numbered_h2.match(line) and not unnumbered_h2.match(line):
                if not found_h2:
                    found_h2 = True
                continue
            if not found_h2:
                continue
            # Skip non-paragraph lines between the heading and the prose.
            if blank.match(line) or html_comment.match(line) or raw_latex.match(line) or list_item.match(line):
                continue
            if line.strip().startswith("#"):
                continue

            # First paragraph line — the one that gets the drop cap.
            if starts_xref.match(line):
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="dropcap_crossref",
                        message="Drop cap paragraph starts with cross-reference",
                        severity="error",
                        context=line.strip()[:120],
                    )
                )
            elif starts_link.match(line):
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="dropcap_link",
                        message="Drop cap paragraph starts with markdown link",
                        severity="error",
                        context=line.strip()[:120],
                    )
                )
            elif starts_inline.match(line):
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="dropcap_inline",
                        message="Drop cap paragraph starts with inline code",
                        severity="error",
                        context=line.strip()[:120],
                    )
                )
            # Only check first paragraph per file
            break

    return ValidationRunResult(
        name="dropcaps",
        description="Validate drop cap compatibility",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Parts (ported from validate_part_keys.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_parts(self, root: Path) -> ValidationRunResult:
    """Validate every ``\\part{key:...}`` marker against summaries.yml.

    Collects the set of known part keys from whichever summaries.yml
    files exist (top-level, vol1, vol2 layouts), then flags any
    ``\\part{key:...}`` whose key is not in that set. Keys are compared
    case-insensitively with underscores and hyphens stripped, so
    "ml-systems" matches "ML_Systems". Skips gracefully (0 files,
    no issues) when pyyaml is missing or no summaries.yml is found.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    part_key_pat = re.compile(r"\\part\{key:([^}]+)\}")

    # Single normalization used on both sides of the comparison
    # (previously duplicated inline in two places).
    def norm(key: str) -> str:
        return key.lower().replace("_", "").replace("-", "")

    # Candidate locations: single-volume and per-volume layouts.
    possible_paths = [
        self.config_manager.book_dir / "contents" / "parts" / "summaries.yml",
        self.config_manager.book_dir / "contents" / "vol1" / "parts" / "summaries.yml",
        self.config_manager.book_dir / "contents" / "vol2" / "parts" / "summaries.yml",
    ]

    try:
        import yaml
    except ImportError:
        # pyyaml is optional; skip rather than fail the whole run.
        return ValidationRunResult(
            name="parts",
            description="Validate part keys (skipped — pyyaml not installed)",
            files_checked=0,
            issues=[],
            elapsed_ms=int((time.time() - start) * 1000),
        )

    summaries_keys: Set[str] = set()
    for yml_path in possible_paths:
        if not yml_path.exists():
            continue
        try:
            data = yaml.safe_load(yml_path.read_text(encoding="utf-8"))
            if not isinstance(data, dict):
                # safe_load returns None for an empty file; a list/str is
                # equally unusable. Skip explicitly instead of relying on
                # the broad except below.
                continue
            for part in data.get("parts", []):
                if "key" in part:
                    summaries_keys.add(norm(part["key"]))
        except Exception:
            # Malformed YAML — ignore this file; other locations may work.
            pass

    if not summaries_keys:
        # No summaries found — skip gracefully
        return ValidationRunResult(
            name="parts",
            description="Validate part keys (skipped — no summaries.yml found)",
            files_checked=0,
            issues=[],
            elapsed_ms=int((time.time() - start) * 1000),
        )

    for file in files:
        content = self._read_text(file)
        for m in part_key_pat.finditer(content):
            key = m.group(1)
            if norm(key) not in summaries_keys:
                line_no = content[: m.start()].count("\n") + 1
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=line_no,
                        code="invalid_part_key",
                        message=f"Part key '{key}' not found in summaries.yml",
                        severity="error",
                        context=m.group(0),
                    )
                )

    return ValidationRunResult(
        name="parts",
        description="Validate \\part{{key:...}} against summaries.yml",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Heading levels (detect skipped heading levels)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_heading_levels(self, root: Path) -> ValidationRunResult:
    """Detect heading level skips outside of div contexts.

    Headings inside Quarto divs (callouts, panels, columns, etc.) are
    in a separate nesting context and are excluded from the hierarchy
    check. Only headings at the top-level (div depth 0) are compared
    against each other.

    A warning (not an error) is emitted for each top-level heading that
    jumps more than one level deeper than the previous top-level heading
    (e.g. ## followed directly by ####). Moving shallower is never
    flagged.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    heading_pat = re.compile(r"^(#{1,6})\s+")
    code_fence = re.compile(r"^```")
    yaml_fence = re.compile(r"^---\s*$")
    # Div open: ::: or :::: (with optional class/id)
    div_open_pat = re.compile(r"^(:{3,})\s*\{")
    # Div close: bare ::: or :::: on its own line
    div_close_pat = re.compile(r"^(:{3,})\s*$")

    for file in files:
        lines = self._read_text(file).splitlines()
        in_code = False
        in_yaml = False
        # Level of the last top-level heading seen; 0 means "none yet",
        # which also suppresses flagging the very first heading.
        prev_level = 0
        div_depth = 0

        for idx, line in enumerate(lines, 1):
            stripped = line.strip()

            # Track YAML front matter
            if idx == 1 and yaml_fence.match(line):
                in_yaml = True
                continue
            if in_yaml:
                if yaml_fence.match(line):
                    in_yaml = False
                continue

            # Track code blocks
            if code_fence.match(stripped):
                in_code = not in_code
                continue
            if in_code:
                continue

            # Track div nesting depth
            if div_open_pat.match(stripped):
                div_depth += 1
                continue
            if div_close_pat.match(stripped) and div_depth > 0:
                div_depth -= 1
                continue

            # Skip headings inside divs — they're in a nested context
            if div_depth > 0:
                continue

            m = heading_pat.match(line)
            if not m:
                continue

            level = len(m.group(1))

            # Only flag if we skip a level going deeper
            # (e.g., ## -> #### skips ###)
            if prev_level > 0 and level > prev_level + 1:
                skipped = ", ".join(
                    f"H{i}" for i in range(prev_level + 1, level)
                )
                heading_text = line.lstrip("#").strip()
                # Truncate at { to remove attributes
                if "{" in heading_text:
                    heading_text = heading_text[: heading_text.index("{")].strip()
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="heading_level_skip",
                        message=f"Heading jumps from H{prev_level} to H{level} (skips {skipped})",
                        severity="warning",
                        context=heading_text[:80],
                    )
                )

            prev_level = level

    return ValidationRunResult(
        name="heading-levels",
        description="Detect skipped heading levels (e.g., ## to ####)",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Duplicate consecutive words (detect "the the", "is is", etc.)
|
|
# ------------------------------------------------------------------
|
|
|
|
# Two identical words (2+ chars) separated only by whitespace, e.g.
# "the the". The backreference \1 forces the second occurrence to equal
# the first; re.IGNORECASE makes "The the" match as well.
_DUPE_WORD_PAT = re.compile(
    r"\b(\w{2,})\s+\1\b",
    re.IGNORECASE,
)
# Known false positives: intentional repetitions
# (compared lowercased in _run_duplicate_words).
_DUPE_WORD_ALLOW = frozenset({
    "had", "that", "do", "bye", "bla", "cha", "go",
    "log",  # "log log n" is valid math
})
|
|
|
|
def _run_duplicate_words(self, root: Path) -> ValidationRunResult:
    """Scan prose lines for accidentally doubled words ("the the").

    YAML front matter, fenced code blocks, HTML comments, raw LaTeX
    lines, div fences, HTML tags, and cell-option lines (#| / %%|) are
    skipped. Matches of _DUPE_WORD_PAT are then filtered against
    _DUPE_WORD_ALLOW and against LaTeX-command/attribute contexts.
    All findings are warnings.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    code_fence = re.compile(r"^```")
    yaml_fence = re.compile(r"^---\s*$")

    for file in files:
        lines = self._read_text(file).splitlines()
        in_code = False
        in_yaml = False

        for idx, line in enumerate(lines, 1):
            # Track YAML front matter
            if idx == 1 and yaml_fence.match(line):
                in_yaml = True
                continue
            if in_yaml:
                if yaml_fence.match(line):
                    in_yaml = False
                continue

            # Skip code blocks
            if code_fence.match(line.strip()):
                in_code = not in_code
                continue
            if in_code:
                continue

            # Skip HTML comments, raw LaTeX, div fences, HTML tags
            stripped = line.strip()
            if stripped.startswith("<!--") or stripped.startswith("\\") or stripped.startswith(":::"):
                continue
            # "<http" is an autolink, not an HTML tag — keep scanning it.
            if stripped.startswith("<") and not stripped.startswith("<http"):
                continue
            # Skip lines that are mostly attributes/metadata
            if stripped.startswith("#|") or stripped.startswith("%%|"):
                continue

            for m in self._DUPE_WORD_PAT.finditer(line):
                word = m.group(1).lower()
                if word in self._DUPE_WORD_ALLOW:
                    continue
                # Skip if inside a LaTeX command or attribute
                # (a "\" right before the match, or a "{" within/just
                # after it, suggests markup rather than prose).
                before = line[: m.start()]
                if before.rstrip().endswith("\\") or "{" in line[m.start() : m.end() + 5]:
                    continue
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=idx,
                        code="duplicate_word",
                        message=f'Duplicate word: "{m.group(1)} {m.group(1)}"',
                        severity="warning",
                        context=line.strip()[:120],
                    )
                )

    return ValidationRunResult(
        name="duplicate-words",
        description="Detect duplicate consecutive words (typos)",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Images (ported from validate_image_references.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_images(self, root: Path) -> ValidationRunResult:
    """Validate that locally referenced images exist on disk.

    Parses markdown image syntax ``![alt](path "title"){attrs}``.
    Remote URLs and non-image extensions are ignored. Existing files
    are additionally checked for exact on-disk case, since a mismatch
    builds on macOS/Windows but breaks on Linux.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    img_pat = re.compile(r"!\[(?:[^\]]|\[[^\]]*\])*\]\(([^)]+)\)(?:\{[^}]*\})?")
    # Optional quoted markdown title after the path: (img.png "Caption")
    title_pat = re.compile(r'\s+["\'][^"\']*["\']\s*$')
    valid_exts = {".png", ".jpg", ".jpeg", ".gif", ".svg"}

    for file in files:
        content = self._read_text(file)
        for m in img_pat.finditer(content):
            img_path = m.group(1).strip()
            # Strip an optional quoted title so it isn't treated as part
            # of the path — previously ![alt](img.png "Caption") was
            # reported as "Image not found: img.png "Caption"".
            img_path = title_pat.sub("", img_path)
            # Markdown also allows the target wrapped in angle brackets.
            if img_path.startswith("<") and img_path.endswith(">"):
                img_path = img_path[1:-1]
            if img_path.startswith(("http://", "https://")):
                continue
            ext = Path(img_path).suffix.lower()
            if ext not in valid_exts:
                continue

            resolved = (file.parent / img_path).resolve()
            line_no = content[: m.start()].count("\n") + 1

            if not resolved.exists():
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=line_no,
                        code="missing_image",
                        message=f"Image not found: {img_path}",
                        severity="error",
                        context=img_path,
                    )
                )
            else:
                # Case check: compare against the on-disk spelling.
                try:
                    actual = self._realcase(str(resolved))
                    if str(resolved) != actual:
                        issues.append(
                            ValidationIssue(
                                file=self._relative_file(file),
                                line=line_no,
                                code="image_case_mismatch",
                                message=f"Image case mismatch: ref='{Path(str(resolved)).name}' disk='{Path(actual).name}'",
                                severity="error",
                                context=img_path,
                            )
                        )
                except (FileNotFoundError, OSError):
                    pass

    return ValidationRunResult(
        name="images",
        description="Validate image references exist on disk",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
@staticmethod
|
|
def _realcase(path: str) -> str:
|
|
"""Resolve actual case of a path on disk."""
|
|
dirname, basename = os.path.split(path)
|
|
if dirname == path:
|
|
return dirname
|
|
dirname = ValidateCommand._realcase(dirname)
|
|
norm_base = os.path.normcase(basename)
|
|
try:
|
|
for child in os.listdir(dirname):
|
|
if os.path.normcase(child) == norm_base:
|
|
return os.path.join(dirname, child)
|
|
except OSError:
|
|
pass
|
|
return path
|
|
|
|
# ------------------------------------------------------------------
|
|
# Self-referential sections (ported from check_self_referential_sections.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_self_referential(self, root: Path) -> ValidationRunResult:
    """Detect sections that reference themselves, their parent, or child.

    Two passes per file: (1) build the heading hierarchy, recording
    each heading's level, explicit {#sec-...} id, and nearest parent
    id; (2) scan every @sec-... reference, locate the section that
    contains the referencing line, and warn when the target is that
    section itself, its direct parent, or one of its direct children.
    All findings are warnings.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    # Heading: 1-6 #'s, title, optional trailing {#id} attribute.
    heading_pat = re.compile(r"^(#{1,6})\s+(.+?)(?:\s+\{#([^}]+)\})?$")
    ref_pat = re.compile(r"@(sec-[a-zA-Z0-9-]+)")
    # NOTE(review): no code-fence tracking here, so "#"-prefixed lines
    # inside fenced code blocks are parsed as headings — confirm this
    # cannot cause false positives in the book corpus.

    for file in files:
        lines = self._read_text(file).splitlines()

        # Build heading hierarchy
        headings: List[Dict] = []
        parent_stack: Dict[int, Dict] = {}  # most recent heading per level

        for idx, line in enumerate(lines, 1):
            m = heading_pat.match(line)
            if not m:
                continue
            level = len(m.group(1))
            title = m.group(2).strip()
            sec_id = m.group(3)  # None when no explicit {#...}
            # Parent = nearest shallower heading still on the stack.
            parent_id = None
            for plevel in range(level - 1, 0, -1):
                if plevel in parent_stack:
                    parent_id = parent_stack[plevel].get("id")
                    break
            hd = {"level": level, "title": title, "id": sec_id,
                  "line": idx, "parent_id": parent_id}
            headings.append(hd)
            parent_stack[level] = hd
            # Drop deeper levels: they cannot parent what follows.
            parent_stack = {k: v for k, v in parent_stack.items() if k <= level}

        # Build section map and children map
        section_map: Dict[str, Dict] = {}
        children_map: Dict[str, List[str]] = defaultdict(list)
        for hd in headings:
            if hd["id"]:
                section_map[hd["id"]] = hd
                if hd["parent_id"]:
                    children_map[hd["parent_id"]].append(hd["id"])

        # Check references
        for idx, line in enumerate(lines, 1):
            for m in ref_pat.finditer(line):
                ref_id = m.group(1)
                # Find which section this line belongs to: headings is in
                # line order, so take the last one at or before idx.
                current = None
                for hd in headings:
                    if hd["line"] <= idx:
                        current = hd
                    else:
                        break
                if not current or not current["id"]:
                    continue

                cur_id = current["id"]
                if ref_id == cur_id:
                    issues.append(ValidationIssue(
                        file=self._relative_file(file), line=idx,
                        code="self_reference",
                        message=f"Section '{current['title']}' references itself (@{ref_id})",
                        severity="warning",
                        context=line.strip()[:120],
                    ))
                elif current["parent_id"] == ref_id:
                    parent = section_map.get(ref_id)
                    ptitle = parent["title"] if parent else ref_id
                    issues.append(ValidationIssue(
                        file=self._relative_file(file), line=idx,
                        code="parent_reference",
                        message=f"Section '{current['title']}' references its parent '{ptitle}' (@{ref_id})",
                        severity="warning",
                        context=line.strip()[:120],
                    ))
                elif ref_id in children_map.get(cur_id, []):
                    child = section_map.get(ref_id)
                    ctitle = child["title"] if child else ref_id
                    issues.append(ValidationIssue(
                        file=self._relative_file(file), line=idx,
                        code="child_reference",
                        message=f"Section '{current['title']}' references its child '{ctitle}' (@{ref_id})",
                        severity="warning",
                        context=line.strip()[:120],
                    ))

    return ValidationRunResult(
        name="self-referential",
        description="Detect self-referential section references",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Figure label underscores (ported from check_fig_references.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_fig_label_underscores(self, root: Path) -> ValidationRunResult:
    """Flag figure labels/references containing underscores.

    Quarto cross-reference identifiers must use hyphens; both label
    definitions ({#fig-...}) and references (@fig-...) are scanned.
    Fenced code blocks are skipped. Each finding is an error.
    """
    t0 = time.time()
    qmd_files = self._qmd_files(root)
    findings: List[ValidationIssue] = []

    fig_ref_pat = re.compile(r"(?:\{#|@)fig-([^}\s]+)")

    for qmd in qmd_files:
        inside_fence = False
        for lineno, text in enumerate(self._read_text(qmd).splitlines(), 1):
            if text.strip().startswith("```"):
                inside_fence = not inside_fence
                continue
            if inside_fence:
                continue
            for hit in fig_ref_pat.finditer(text):
                suffix = hit.group(1)
                if "_" not in suffix:
                    continue
                findings.append(ValidationIssue(
                    file=self._relative_file(qmd), line=lineno,
                    code="fig_label_underscore",
                    message=f"Figure label contains underscore: fig-{suffix} (use hyphens)",
                    severity="error",
                    context=text.strip()[:120],
                ))

    return ValidationRunResult(
        name="fig-labels",
        description="Detect underscores in figure labels",
        files_checked=len(qmd_files),
        issues=findings,
        elapsed_ms=int((time.time() - t0) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# ASCII check (ported from check_ascii.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_ascii(self, root: Path) -> ValidationRunResult:
    """Report every non-ASCII character found in prose lines of QMD files.

    Fenced code blocks, raw LaTeX lines (leading backslash), and HTML
    comments are ignored. Each offending character is reported as a
    warning with its Unicode code point and a small context window.
    """
    t0 = time.time()
    qmd_files = self._qmd_files(root)
    findings: List[ValidationIssue] = []

    outside_ascii = re.compile(r"[^\x00-\x7F]")

    for qmd in qmd_files:
        inside_fence = False
        for lineno, text in enumerate(self._read_text(qmd).splitlines(), 1):
            bare = text.strip()
            if bare.startswith("```"):
                inside_fence = not inside_fence
                continue
            if inside_fence or bare.startswith("\\") or bare.startswith("<!--"):
                continue
            for hit in outside_ascii.finditer(text):
                ch = hit.group(0)
                pos = hit.start()
                snippet = text[max(0, pos - 10):min(len(text), pos + 10)]
                findings.append(ValidationIssue(
                    file=self._relative_file(qmd), line=lineno,
                    code="non_ascii",
                    message=f"Non-ASCII character '{ch}' (U+{ord(ch):04X})",
                    severity="warning",
                    context=snippet.strip(),
                ))

    return ValidationRunResult(
        name="ascii",
        description="Detect non-ASCII characters in QMD files",
        files_checked=len(qmd_files),
        issues=findings,
        elapsed_ms=int((time.time() - t0) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Percent spacing (no space between number/str and %)
|
|
# ------------------------------------------------------------------
|
|
|
|
PERCENT_SPACING_PATTERN = re.compile(
    r"`[^`]*`"  # an inline code span, e.g. `{python} x`
    r"\s+%"     # whitespace before the percent sign (the defect)
)
|
|
|
|
def _run_percent_spacing(self, root: Path) -> ValidationRunResult:
    """Flag space between inline expression and % (e.g. `{python} x` % → use `{python} x`%)."""
    t0 = time.time()
    qmd_files = self._qmd_files(root)
    findings: List[ValidationIssue] = []

    for qmd in qmd_files:
        inside_fence = False
        for lineno, text in enumerate(self._read_text(qmd).splitlines(), 1):
            if text.strip().startswith("```"):
                inside_fence = not inside_fence
                continue
            if inside_fence:
                continue
            for hit in self.PERCENT_SPACING_PATTERN.finditer(text):
                # Show a little surrounding text so the fix site is obvious.
                lo, hi = max(0, hit.start() - 5), min(len(text), hit.end() + 10)
                findings.append(
                    ValidationIssue(
                        file=self._relative_file(qmd),
                        line=lineno,
                        code="percent_spacing",
                        message="Remove space between value and % (use e.g. `{python} x`% not `{python} x` %)",
                        severity="error",
                        context=text[lo:hi].strip(),
                    )
                )

    return ValidationRunResult(
        name="percent-spacing",
        description="No space between inline value and % in QMD prose",
        files_checked=len(qmd_files),
        issues=findings,
        elapsed_ms=int((time.time() - t0) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Unit spacing (style: "100 ms", "4 GB" — never "100ms" or "4GB")
|
|
# ------------------------------------------------------------------
|
|
|
|
# Number (optional decimal) immediately followed by unit with no space
# (invalid per book-prose.md). The negative lookbehind keeps the match
# from firing inside identifiers or dotted tokens such as "x100ms" or
# "v1.2.3GB", which the bare pattern previously flagged.
UNIT_SPACING_PATTERN = re.compile(
    r"(?<![\w.])"
    r"\d+(?:\.\d+)?"
    r"(?:ms|GB|TB|MB|KB|Gbps|Mbps|Tbps|TFLOPS|GFLOPS|W)\b"
)
|
|
|
|
def _run_unit_spacing(self, root: Path) -> ValidationRunResult:
    """Flag number+unit with no space (e.g. 100ms → 100 ms, 4GB → 4 GB)."""
    t0 = time.time()
    qmd_files = self._qmd_files(root)
    findings: List[ValidationIssue] = []

    for qmd in qmd_files:
        inside_fence = False
        for lineno, text in enumerate(self._read_text(qmd).splitlines(), 1):
            if text.strip().startswith("```"):
                inside_fence = not inside_fence
                continue
            if inside_fence:
                continue
            for hit in self.UNIT_SPACING_PATTERN.finditer(text):
                lo, hi = max(0, hit.start() - 2), min(len(text), hit.end() + 5)
                findings.append(
                    ValidationIssue(
                        file=self._relative_file(qmd),
                        line=lineno,
                        code="unit_spacing",
                        message="Insert space between number and unit (e.g. 100 ms not 100ms, 4 GB not 4GB)",
                        severity="warning",
                        context=text[lo:hi].strip(),
                    )
                )

    return ValidationRunResult(
        name="unit-spacing",
        description="Require space between number and unit (book-prose.md)",
        files_checked=len(qmd_files),
        issues=findings,
        elapsed_ms=int((time.time() - t0) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Binary units (style: "GB" and "TB", not "GiB" or "TiB" in prose)
|
|
# ------------------------------------------------------------------
|
|
|
|
# Binary (IEC) size units that book-prose.md disallows in prose.
BINARY_UNITS_PATTERN = re.compile(r"\b(KiB|MiB|GiB|TiB)\b")
|
|
|
|
def _run_binary_units(self, root: Path) -> ValidationRunResult:
    """Flag GiB/TiB in prose — use GB/TB per book-prose.md."""
    t0 = time.time()
    qmd_files = self._qmd_files(root)
    findings: List[ValidationIssue] = []

    for qmd in qmd_files:
        inside_fence = False
        for lineno, text in enumerate(self._read_text(qmd).splitlines(), 1):
            if text.strip().startswith("```"):
                inside_fence = not inside_fence
                continue
            if inside_fence:
                continue
            for hit in self.BINARY_UNITS_PATTERN.finditer(text):
                lo, hi = max(0, hit.start() - 3), min(len(text), hit.end() + 3)
                findings.append(
                    ValidationIssue(
                        file=self._relative_file(qmd),
                        line=lineno,
                        code="binary_units",
                        message="Use GB/TB not GiB/TiB in prose (book-prose.md)",
                        severity="warning",
                        context=text[lo:hi].strip(),
                    )
                )

    return ValidationRunResult(
        name="binary-units",
        description="No GiB/TiB in prose — use GB/TB",
        files_checked=len(qmd_files),
        issues=findings,
        elapsed_ms=int((time.time() - t0) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Contractions (forbidden in body prose per book-prose.md)
|
|
# ------------------------------------------------------------------
|
|
|
|
# Common English contractions, matched case-insensitively. Generalized to
# also cover pronoun+verb forms (they're, I'm, we've, …) and other
# frequent negations that the original list missed; every previously
# matched form is still matched.
CONTRACTIONS_PATTERN = re.compile(
    r"\b(can't|don't|it's|we'll|won't|hasn't|haven't|isn't|aren't|wasn't|weren't|"
    r"doesn't|didn't|wouldn't|couldn't|shouldn't|that's|there's|here's|what's|"
    r"they're|you're|we're|who's|let's|i'm|you'll|they'll|she'll|he'll|"
    r"we've|you've|they've|i've|hadn't|mustn't|needn't|mightn't)\b",
    re.IGNORECASE,
)
|
|
|
|
def _run_contractions(self, root: Path) -> ValidationRunResult:
    """Flag contractions in prose — use full forms (cannot, do not, etc.)."""
    t0 = time.time()
    qmd_files = self._qmd_files(root)
    findings: List[ValidationIssue] = []

    for qmd in qmd_files:
        inside_fence = False
        for lineno, text in enumerate(self._read_text(qmd).splitlines(), 1):
            bare = text.strip()
            if bare.startswith("```"):
                inside_fence = not inside_fence
                continue
            # Table rows and HTML comments are exempt from the prose rule.
            if inside_fence or bare.startswith("|") or bare.startswith("<!--"):
                continue
            for hit in self.CONTRACTIONS_PATTERN.finditer(text):
                lo, hi = max(0, hit.start() - 2), min(len(text), hit.end() + 2)
                findings.append(
                    ValidationIssue(
                        file=self._relative_file(qmd),
                        line=lineno,
                        code="contractions",
                        message="Contractions forbidden in body prose — use full form (e.g. cannot, do not)",
                        severity="warning",
                        context=text[lo:hi].strip(),
                    )
                )

    return ValidationRunResult(
        name="contractions",
        description="No contractions in body prose (book-prose.md)",
        files_checked=len(qmd_files),
        issues=findings,
        elapsed_ms=int((time.time() - t0) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Unblended prose (paragraph split with leading space after period)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_unblended_prose(self, root: Path) -> ValidationRunResult:
    """Flag line starting with single space after previous line ended with period.

    A line beginning with a space followed by an uppercase letter,
    right after a line that ends in ".", usually means a paragraph was
    accidentally split across two source lines. Findings are warnings.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    for file in files:
        lines = self._read_text(file).splitlines()
        in_code = False
        for i in range(1, len(lines)):
            # Fence state is driven by the *previous* line: each pair
            # (i-1, i) is visited once as i advances, so every line
            # except the last gets its fence checked exactly once.
            # NOTE(review): a fence on the final line never toggles;
            # harmless, as no following line remains to examine.
            if lines[i - 1].strip().startswith("```"):
                in_code = not in_code
            if in_code:
                continue
            prev = lines[i - 1].strip()
            curr = lines[i]
            if not prev.endswith("."):
                continue
            # Trigger only on: leading space + uppercase letter.
            if not (len(curr) > 1 and curr[0] == " " and curr[1].isupper()):
                continue
            context = (curr[:60] + "…") if len(curr) > 60 else curr
            issues.append(
                ValidationIssue(
                    file=self._relative_file(file),
                    line=i + 1,
                    code="unblended_prose",
                    message="Paragraph likely split: line starts with space after period — merge into one paragraph",
                    severity="warning",
                    context=context.strip(),
                )
            )

    return ValidationRunResult(
        name="unblended-prose",
        description="Detect wrongly split paragraphs (leading space after period)",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Times spacing (space after $\\times$ before word/unit per book-prose.md)
|
|
# ------------------------------------------------------------------
|
|
|
|
# $\times$ (optionally with whitespace before the closing $) followed
# IMMEDIATELY by a letter or "(" — i.e. no space after the math span.
# The old pattern had a stray \s* after the closing $, so correctly
# spaced text like "$\times$ speedup" was flagged too.
TIMES_SPACING_PATTERN = re.compile(r"\$\\times\s*\$[a-zA-Z(]")
|
|
|
|
def _run_times_spacing(self, root: Path) -> ValidationRunResult:
    """Flag $\\times$ immediately followed by word/paren with no space."""
    t0 = time.time()
    qmd_files = self._qmd_files(root)
    findings: List[ValidationIssue] = []

    for qmd in qmd_files:
        inside_fence = False
        for lineno, text in enumerate(self._read_text(qmd).splitlines(), 1):
            if text.strip().startswith("```"):
                inside_fence = not inside_fence
                continue
            if inside_fence:
                continue
            for hit in self.TIMES_SPACING_PATTERN.finditer(text):
                lo, hi = max(0, hit.start() - 2), min(len(text), hit.end() + 10)
                findings.append(
                    ValidationIssue(
                        file=self._relative_file(qmd),
                        line=lineno,
                        code="times_spacing",
                        message="Add space after $\\times$ before word or unit (e.g. $\\times$ speedup)",
                        severity="warning",
                        context=text[lo:hi].strip(),
                    )
                )

    return ValidationRunResult(
        name="times-spacing",
        description="Space after $\\times$ before word/unit (book-prose.md)",
        files_checked=len(qmd_files),
        issues=findings,
        elapsed_ms=int((time.time() - t0) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Cross-chapter footnote duplicates (ported from audit_footnotes_cross_chapter.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_footnote_cross_chapter(self, root: Path) -> ValidationRunResult:
    """Find duplicate footnote IDs across chapters.

    Footnote definitions ([^fn-...]: text) should be unique book-wide;
    each occurrence of a duplicated ID is reported as a warning with a
    snippet of its definition text.
    """
    start = time.time()
    files = self._qmd_files(root)
    issues: List[ValidationIssue] = []

    fn_def_pat = re.compile(r"\[\^(fn-[^\]]+)\]:\s*(.+?)(?=\n\n|\n\[\^|\Z)", re.DOTALL)

    # Collect all footnotes by ID. File contents are cached so the
    # reporting pass below does not re-read every file once per
    # duplicate occurrence (the previous implementation called
    # self._read_text inside the inner loop).
    footnotes_by_id: Dict[str, List[Tuple[Path, str]]] = defaultdict(list)
    content_cache: Dict[Path, str] = {}

    for file in files:
        content = self._read_text(file)
        content_cache[file] = content
        for m in fn_def_pat.finditer(content):
            fn_id = m.group(1)
            # Collapse whitespace and cap length for the report context.
            fn_content = " ".join(m.group(2).split())[:200]
            footnotes_by_id[fn_id].append((file, fn_content))

    # Report duplicates
    for fn_id, occurrences in footnotes_by_id.items():
        if len(occurrences) <= 1:
            continue
        for file, content in occurrences:
            line_no = self._line_for_token(content_cache[file], f"[^{fn_id}]:")
            issues.append(ValidationIssue(
                file=self._relative_file(file), line=line_no,
                code="cross_chapter_footnote",
                message=f"Footnote [^{fn_id}] also defined in {len(occurrences) - 1} other file(s)",
                severity="warning",
                context=content[:80],
            ))

    return ValidationRunResult(
        name="cross-chapter-footnotes",
        description="Detect duplicate footnote IDs across chapters",
        files_checked=len(files),
        issues=issues,
        elapsed_ms=int((time.time() - start) * 1000),
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Table content validation (delegated to validate_tables.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_table_content(self, root: Path) -> ValidationRunResult:
    """Validate grid table content (bare pipes, fracs, HTML entities, etc.)."""
    # Delegated: the checker lives under book/tools/scripts/content/.
    tools_root = Path(__file__).resolve().parent.parent.parent
    checker = tools_root / "tools" / "scripts" / "content" / "validate_tables.py"
    return self._delegate_script(checker, ["-d", str(root)], "table-content")
|
|
|
|
# ------------------------------------------------------------------
|
|
# Spelling checks (delegated to check_prose_spelling.py / check_tikz_spelling.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_spelling_prose(self, root: Path) -> ValidationRunResult:
    """Spell check prose text (requires aspell)."""
    # Delegated: spelling checkers live under book/tools/scripts/content/.
    tools_root = Path(__file__).resolve().parent.parent.parent
    checker = tools_root / "tools" / "scripts" / "content" / "check_prose_spelling.py"
    return self._delegate_script(checker, [str(root)], "spelling-prose")
|
|
|
|
def _run_spelling_tikz(self, root: Path) -> ValidationRunResult:
    """Spell check TikZ diagram text (requires aspell)."""
    tools_root = Path(__file__).resolve().parent.parent.parent
    checker = tools_root / "tools" / "scripts" / "content" / "check_tikz_spelling.py"
    # check_tikz_spelling.py auto-scans from repo root, so pass no args
    return self._delegate_script(checker, [], "spelling-tikz")
|
|
|
|
# ------------------------------------------------------------------
|
|
# EPUB validation (delegated to validate_epub.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_epub(self, root: Path) -> ValidationRunResult:
    """Validate EPUB file structure and content.

    Locates the most recently modified .epub anywhere under the book
    directory and delegates to validate_epub.py in --quick mode. Skips
    cleanly (0 files, no issues) when no EPUB has been built yet.
    """
    book_dir = Path(__file__).resolve().parent.parent.parent
    script = book_dir / "tools" / "scripts" / "utilities" / "validate_epub.py"
    # Find EPUB files in build output directories
    epub_files = list(book_dir.rglob("*.epub"))
    if not epub_files:
        return ValidationRunResult(
            name="epub", description="EPUB validation (no .epub files found)",
            files_checked=0, issues=[], elapsed_ms=0,
        )
    # Validate the most recent EPUB — max() is O(n); no need to sort.
    newest = max(epub_files, key=lambda p: p.stat().st_mtime)
    return self._delegate_script(script, ["--quick", str(newest)], "epub")
|
|
|
|
# ------------------------------------------------------------------
|
|
# Content tree: require shared/ and frontmatter/ (not only vol1/vol2)
|
|
# ------------------------------------------------------------------
|
|
|
|
# Required paths under contents/ so that scripts don't assume only vol1/vol2 exist.
|
|
# Required paths under contents/ so that scripts don't assume only vol1/vol2 exist.
# Each entry is (path relative to contents/, is_dir): directories are checked
# with is_dir(), files with is_file(), by _run_content_tree.
CONTENT_TREE_REQUIRED: List[tuple] = [
    ("shared", True),  # (path relative to contents, is_dir)
    ("shared/notation.qmd", False),
    ("frontmatter", True),
]
|
|
|
|
def _run_content_tree(self, root: Path) -> ValidationRunResult:
    """Ensure contents/ has shared/ and frontmatter/; fail if they are missing.

    Passes trivially (0 files checked) when *root* does not look like the
    book contents root — e.g. when validation was pointed at a single
    chapter directory. Each missing required entry is an error.
    """
    t0 = time.time()
    # Resolve to contents dir: root may be contents, or contents/vol1, or contents/vol2
    if root.name in ("vol1", "vol2") and root.parent.name == "contents":
        contents_dir = root.parent
    else:
        contents_dir = root
    if not (contents_dir / "vol1").is_dir() or not (contents_dir / "vol2").is_dir():
        # Not the book contents root; skip (e.g. user passed a chapter path)
        return ValidationRunResult(
            name="content-tree",
            description="Content tree (shared/frontmatter required)",
            files_checked=0,
            issues=[],
            elapsed_ms=int((time.time() - t0) * 1000),
        )
    issues: List[ValidationIssue] = []
    for rel, is_dir in self.CONTENT_TREE_REQUIRED:
        path = contents_dir / rel
        # Single code path for both entry kinds (the two branches were
        # previously duplicated); the messages are kept byte-identical.
        if is_dir:
            present = path.is_dir()
            message = (
                f"Required directory missing: contents/{rel} "
                f"(shared content used by both volumes)"
            )
        else:
            present = path.is_file()
            message = f"Required file missing: contents/{rel}"
        if not present:
            issues.append(
                ValidationIssue(
                    file=str(path),
                    line=0,
                    code="content-tree",
                    message=message,
                    severity="error",
                )
            )
    elapsed = int((time.time() - t0) * 1000)
    return ValidationRunResult(
        name="content-tree",
        description="Content tree (shared/frontmatter required)",
        files_checked=len(self.CONTENT_TREE_REQUIRED),
        issues=issues,
        elapsed_ms=elapsed,
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Source citation validation (delegated to manage_sources.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_sources(self, root: Path) -> ValidationRunResult:
    """Validate source citations (asterisk sources, formatting, etc.).

    Delegates to tools/scripts/utilities/manage_sources.py --problems.
    Exit code 0 means no problems; any other exit (or a missing
    interpreter/script) is surfaced as a single error issue carrying
    the (truncated) script output.
    """
    import subprocess as _sp
    script = (
        Path(__file__).resolve().parent.parent.parent
        / "tools" / "scripts" / "utilities" / "manage_sources.py"
    )
    # manage_sources.py expects to be run from the quarto root (where contents/ lives)
    quarto_dir = Path(__file__).resolve().parent.parent.parent / "quarto"
    t0 = time.time()
    cmd = ["python3", str(script), "--problems"]
    # NOTE(review): subprocess.TimeoutExpired is not caught here and will
    # propagate past the 120 s limit — confirm that is intended.
    try:
        result = _sp.run(cmd, capture_output=True, text=True, timeout=120, cwd=str(quarto_dir))
        elapsed = int((time.time() - t0) * 1000)
        if result.returncode == 0:
            return ValidationRunResult(
                name="sources", description="Source citation validation",
                files_checked=0, issues=[], elapsed_ms=elapsed,
            )
        # Non-zero exit: bundle stdout+stderr into one error issue.
        output = (result.stdout + result.stderr).strip()
        return ValidationRunResult(
            name="sources", description="Source citation validation",
            files_checked=0, elapsed_ms=elapsed,
            issues=[ValidationIssue(
                file="(script output)", line=0, code="sources",
                message=output[:500] if output else f"Script exited with code {result.returncode}",
                severity="error",
            )],
        )
    except FileNotFoundError:
        # python3 itself is missing from PATH (subprocess raises this
        # for the executable, not for script arguments).
        elapsed = int((time.time() - t0) * 1000)
        return ValidationRunResult(
            name="sources", description="Source citation validation",
            files_checked=0, elapsed_ms=elapsed,
            issues=[ValidationIssue(
                file=str(script), line=0, code="sources",
                message=f"Script not found: {script}", severity="error",
            )],
        )
|
|
|
|
    def _run_check_references(self, root: Path, ns: Optional[argparse.Namespace] = None) -> ValidationRunResult:
        """Validate .bib references against academic DBs (native implementation).

        All optional behavior is driven by ``refs_*`` attributes on *ns*, read
        with getattr so a missing attribute (or ``ns=None``) falls back to a
        default. Relative paths are resolved against the repository root. The
        actual checking is delegated to :mod:`reference_check`; its issue
        dicts are converted into :class:`ValidationIssue` objects here.
        """
        # NOTE(review): assumes config_manager.book_dir sits two levels below
        # the repository root (e.g. <repo>/book/<subdir>) — confirm.
        repo_root = self.config_manager.book_dir.parent.parent
        # Explicit --refs-file list wins; otherwise use the module defaults.
        if getattr(ns, "refs_file", None):
            bib_paths = [Path(f) if Path(f).is_absolute() else repo_root / f for f in ns.refs_file]
        else:
            bib_paths = [repo_root / p for p in reference_check.DEFAULT_BIB_REL_PATHS]
        output_path = Path(ns.refs_output) if getattr(ns, "refs_output", None) else None
        limit = getattr(ns, "refs_limit", None)
        skip_verified = getattr(ns, "refs_skip_verified", False)
        thorough = getattr(ns, "refs_thorough", False)
        # Verification cache: explicit path if given, else a hidden JSON file
        # at the repo root.
        cache_path = getattr(ns, "refs_cache", None)
        if cache_path is not None:
            cache_path = Path(cache_path) if Path(cache_path).is_absolute() else repo_root / cache_path
        else:
            cache_path = repo_root / ".references_verified.json"

        # Optional key filter: re-check only the keys named in a previous
        # report, or listed one-per-line in a plain-text file. A missing
        # report/keys file is a hard error returned as a single failing issue.
        only_keys: Optional[List[str]] = None
        only_from_report = getattr(ns, "refs_only_from_report", None)
        only_keys_file = getattr(ns, "refs_only_keys_file", None)
        if only_from_report:
            report_path = Path(only_from_report) if Path(only_from_report).is_absolute() else repo_root / only_from_report
            if report_path.exists():
                only_keys = reference_check.parse_report_keys(report_path)
            else:
                console.print(f"[red]Report not found: {report_path}[/red]")
                return ValidationRunResult(name="references", description="Bibliography vs academic DBs (hallucinator)", files_checked=0, issues=[ValidationIssue(file=str(report_path), line=0, code="references", message=f"Report not found: {report_path}", severity="error")], elapsed_ms=0)
        elif only_keys_file:
            keys_path = Path(only_keys_file) if Path(only_keys_file).is_absolute() else repo_root / only_keys_file
            if keys_path.exists():
                only_keys = [line.strip() for line in keys_path.read_text(encoding="utf-8").splitlines() if line.strip()]
            else:
                console.print(f"[red]Keys file not found: {keys_path}[/red]")
                return ValidationRunResult(name="references", description="Bibliography vs academic DBs (hallucinator)", files_checked=0, issues=[ValidationIssue(file=str(keys_path), line=0, code="references", message=f"Keys file not found: {keys_path}", severity="error")], elapsed_ms=0)

        # Delegate the lookups; `passed` is unused here because pass/fail is
        # derived from the converted issues by ValidationRunResult.passed.
        passed, elapsed_ms, issue_dicts, files_checked = reference_check.run(
            bib_paths,
            output_path=output_path,
            limit=limit,
            dedupe=True,
            resilient=True,
            console=console,
            cache_path=cache_path,
            skip_verified=skip_verified,
            thorough=thorough,
            only_keys=only_keys,
        )
        issues = [
            ValidationIssue(
                file=d["file"],
                line=d["line"],
                code=d["code"],
                message=d["message"],
                severity=d.get("severity", "error"),
            )
            for d in issue_dicts
        ]
        return ValidationRunResult(
            name="references",
            description="Bibliography vs academic DBs (hallucinator)",
            files_checked=files_checked,
            issues=issues,
            elapsed_ms=elapsed_ms,
        )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Shared helpers
|
|
# ------------------------------------------------------------------
|
|
|
|
def _line_for_token(self, content: str, token: str) -> int:
|
|
index = content.find(token)
|
|
if index < 0:
|
|
return 1
|
|
return content[:index].count("\n") + 1
|
|
|
|
def _print_human_summary(self, summary: Dict[str, Any], verbose: bool = False) -> None:
|
|
runs = summary["runs"]
|
|
total = summary["total_issues"]
|
|
status = summary["status"]
|
|
|
|
table = Table(show_header=True, header_style="bold cyan", box=None)
|
|
table.add_column("Check", style="cyan")
|
|
table.add_column("Files", style="dim")
|
|
table.add_column("Issues", style="yellow")
|
|
table.add_column("Elapsed", style="dim")
|
|
table.add_column("Status", style="white")
|
|
for run in runs:
|
|
table.add_row(
|
|
run["name"],
|
|
str(run["files_checked"]),
|
|
str(run["issue_count"]),
|
|
f'{run["elapsed_ms"]}ms',
|
|
"PASS" if run["passed"] else "FAIL",
|
|
)
|
|
console.print(Panel(table, title="Binder Check Summary", border_style="cyan"))
|
|
|
|
if total == 0:
|
|
console.print("[green]✅ All validation checks passed.[/green]")
|
|
return
|
|
|
|
# Count errors vs warnings across all runs
|
|
total_errors = 0
|
|
total_warnings = 0
|
|
for run in runs:
|
|
for issue in run["issues"]:
|
|
if issue["severity"] == "error":
|
|
total_errors += 1
|
|
else:
|
|
total_warnings += 1
|
|
|
|
for run in runs:
|
|
if run["issue_count"] == 0:
|
|
continue
|
|
run_errors = sum(1 for i in run["issues"] if i["severity"] == "error")
|
|
run_warnings = run["issue_count"] - run_errors
|
|
parts = []
|
|
if run_errors:
|
|
parts.append(f"{run_errors} error(s)")
|
|
if run_warnings:
|
|
parts.append(f"{run_warnings} warning(s)")
|
|
label = ", ".join(parts)
|
|
color = "bold red" if run_errors else "bold yellow"
|
|
console.print(f"[{color}]{run['name']}[/{color}] ({label})")
|
|
for issue in run["issues"][:30]:
|
|
line = issue["line"]
|
|
file = issue["file"]
|
|
msg = issue["message"]
|
|
sev = issue["severity"]
|
|
sev_icon = "❌" if sev == "error" else "⚠️"
|
|
console.print(f" {sev_icon} {file}:{line} {msg}")
|
|
if verbose and issue.get("context"):
|
|
console.print(f" [dim]{issue['context']}[/dim]")
|
|
if run["issue_count"] > 30:
|
|
console.print(f" [dim]... {run['issue_count'] - 30} more[/dim]")
|
|
console.print()
|
|
|
|
if status == "failed":
|
|
console.print(f"[red]❌ Validation failed with {total_errors} error(s).[/red]")
|
|
elif total_warnings > 0:
|
|
console.print(f"[yellow]⚠️ Passed with {total_warnings} warning(s).[/yellow]")
|
|
|
|
def _emit(self, as_json: bool, payload: Dict[str, Any], failed: bool) -> None:
|
|
if as_json:
|
|
print(json.dumps(payload, indent=2))
|
|
return
|
|
if failed:
|
|
console.print(f"[red]{payload.get('message', 'Operation failed')}[/red]")
|
|
else:
|
|
console.print(f"[green]{payload.get('message', 'Operation succeeded')}[/green]")
|
|
|
|
# ------------------------------------------------------------------
|
|
# Delegated checks (call existing scripts via subprocess)
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
|
|
def _delegate_script(script_path: Path, args: List[str], run_name: str) -> ValidationRunResult:
|
|
"""Run an external script and convert its exit code to a ValidationRunResult."""
|
|
import subprocess
|
|
t0 = time.time()
|
|
cmd = ["python3", str(script_path)] + args
|
|
try:
|
|
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
|
|
elapsed = int((time.time() - t0) * 1000)
|
|
if result.returncode == 0:
|
|
return ValidationRunResult(
|
|
name=run_name, description=run_name,
|
|
files_checked=0, issues=[], elapsed_ms=elapsed,
|
|
)
|
|
# Script failed — report its output as a single error
|
|
output = (result.stdout + result.stderr).strip()
|
|
return ValidationRunResult(
|
|
name=run_name, description=run_name,
|
|
files_checked=0, elapsed_ms=elapsed,
|
|
issues=[ValidationIssue(
|
|
file="(script output)", line=0, code=run_name,
|
|
message=output[:500] if output else f"Script exited with code {result.returncode}",
|
|
severity="error",
|
|
)],
|
|
)
|
|
except FileNotFoundError:
|
|
elapsed = int((time.time() - t0) * 1000)
|
|
return ValidationRunResult(
|
|
name=run_name, description=run_name,
|
|
files_checked=0, elapsed_ms=elapsed,
|
|
issues=[ValidationIssue(
|
|
file=str(script_path), line=0, code=run_name,
|
|
message=f"Script not found: {script_path}", severity="error",
|
|
)],
|
|
)
|
|
except subprocess.TimeoutExpired:
|
|
elapsed = int((time.time() - t0) * 1000)
|
|
return ValidationRunResult(
|
|
name=run_name, description=run_name,
|
|
files_checked=0, elapsed_ms=elapsed,
|
|
issues=[ValidationIssue(
|
|
file=str(script_path), line=0, code=run_name,
|
|
message="Script timed out after 120s", severity="error",
|
|
)],
|
|
)
|
|
|
|
def _run_grid_tables(self, root: Path) -> ValidationRunResult:
|
|
"""Check for grid tables (should be converted to pipe tables)."""
|
|
script = (
|
|
Path(__file__).resolve().parent.parent.parent
|
|
/ "tools" / "scripts" / "utilities" / "convert_grid_to_pipe_tables.py"
|
|
)
|
|
qmd_files = [str(f) for f in sorted(root.rglob("*.qmd"))]
|
|
if not qmd_files:
|
|
return ValidationRunResult(name="grid-tables", issues=[])
|
|
return self._delegate_script(script, ["--check"] + qmd_files, "grid-tables")
|
|
|
|
def _run_image_formats(self, root: Path) -> ValidationRunResult:
|
|
"""Validate image file formats using Pillow."""
|
|
script = (
|
|
Path(__file__).resolve().parent.parent.parent
|
|
/ "tools" / "scripts" / "images" / "manage_images.py"
|
|
)
|
|
image_files = []
|
|
for ext in ("*.png", "*.jpg", "*.jpeg", "*.gif"):
|
|
image_files.extend(str(f) for f in sorted(root.rglob(ext)))
|
|
if not image_files:
|
|
return ValidationRunResult(name="image-formats", issues=[])
|
|
return self._delegate_script(script, image_files, "image-formats")
|
|
|
|
def _run_external_images(self, root: Path) -> ValidationRunResult:
|
|
"""Check for external image URLs in QMD files."""
|
|
script = (
|
|
Path(__file__).resolve().parent.parent.parent
|
|
/ "tools" / "scripts" / "images" / "manage_external_images.py"
|
|
)
|
|
return self._delegate_script(
|
|
script, ["--validate", str(root)], "external-images"
|
|
)
|
|
|
|
def _run_json_syntax(self, root: Path) -> ValidationRunResult:
|
|
"""Validate JSON file syntax."""
|
|
t0 = time.time()
|
|
json_files = sorted(root.rglob("*.json"))
|
|
if not json_files:
|
|
return ValidationRunResult(
|
|
name="json-syntax", description="Validate JSON file syntax",
|
|
files_checked=0, issues=[], elapsed_ms=0,
|
|
)
|
|
issues: List[ValidationIssue] = []
|
|
for fpath in json_files:
|
|
try:
|
|
with open(fpath, "r") as f:
|
|
json.load(f)
|
|
except json.JSONDecodeError as e:
|
|
issues.append(ValidationIssue(
|
|
file=str(fpath), line=e.lineno or 0, code="json-syntax",
|
|
message=f"Invalid JSON: {e.msg}", severity="error",
|
|
))
|
|
except Exception as e:
|
|
issues.append(ValidationIssue(
|
|
file=str(fpath), line=0, code="json-syntax",
|
|
message=f"Cannot read: {e}", severity="error",
|
|
))
|
|
elapsed = int((time.time() - t0) * 1000)
|
|
return ValidationRunResult(
|
|
name="json-syntax", description="Validate JSON file syntax",
|
|
files_checked=len(json_files), issues=issues, elapsed_ms=elapsed,
|
|
)
|
|
|
|
def _run_unit_tests(self, root: Path) -> ValidationRunResult:
|
|
"""Run physics engine unit conversion tests."""
|
|
# validate.py is at book/cli/commands/validate.py
|
|
# test_units.py is at book/quarto/mlsys/test_units.py
|
|
book_dir = Path(__file__).resolve().parent.parent.parent # book/
|
|
script = book_dir / "quarto" / "mlsys" / "test_units.py"
|
|
return self._delegate_script(script, [], "unit-tests")
|