""" Native validation commands for MLSysBook Binder CLI. Validation logic is implemented in Binder where possible (e.g. references, citations, labels, figures, rendering). Some checks still delegate to scripts under book/tools/scripts/ (tables, spelling, epub, sources, grid-tables, images). See book/cli/BINDER_NATIVE_AUDIT.md for the full list. """ from __future__ import annotations import argparse import json import os import re import time from collections import defaultdict from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional, Set, Tuple from rich.console import Console from rich.panel import Panel from rich.table import Table from . import reference_check console = Console() @dataclass class ValidationIssue: file: str line: int code: str message: str severity: str = "error" context: str = "" def to_dict(self) -> Dict[str, Any]: return { "file": self.file, "line": self.line, "code": self.code, "message": self.message, "severity": self.severity, "context": self.context, } @dataclass class ValidationRunResult: name: str description: str files_checked: int issues: List[ValidationIssue] elapsed_ms: int @property def passed(self) -> bool: return not any(i.severity == "error" for i in self.issues) def to_dict(self) -> Dict[str, Any]: return { "name": self.name, "description": self.description, "files_checked": self.files_checked, "passed": self.passed, "issue_count": len(self.issues), "elapsed_ms": self.elapsed_ms, "issues": [issue.to_dict() for issue in self.issues], } INLINE_REF_PATTERN = re.compile(r"`\{python\}\s+(\w+(?:\.\w+)?)`") CELL_START_PATTERN = re.compile(r"^```\{python\}|^```python") CELL_END_PATTERN = re.compile(r"^```\s*$") ASSIGN_PATTERN = re.compile(r"^([A-Za-z_]\w*)\s*=") # Tuple unpacking: "a, b = ..." 
TUPLE_ASSIGN_PATTERN = re.compile(r"^((?:[A-Za-z_]\w*\s*,\s*)+[A-Za-z_]\w*)\s*=")
CLASS_DEF_PATTERN = re.compile(r"^class\s+(\w+)\s*[:(]")
GRID_TABLE_SEP_PATTERN = re.compile(r"^\+[-:=+]+\+$")
# NOTE(review): the source text was damaged here (everything between '<' and the
# next '>' was stripped during extraction).  The following module constants and
# class scaffolding were lost and must be restored from version control:
# LATEX_INLINE_PATTERN, LATEX_ADJACENT_PATTERN, CITATION_BRACKET_PATTERN,
# CITATION_REF_PATTERN, EXCLUDED_CITATION_PREFIXES, LABEL_DEF_PATTERNS,
# LABEL_REF_PATTERN, the enclosing command-class header, and its GROUPS table
# (mapping group name -> [(scope, runner-method-name), ...]).

    # NOTE(review): signature reconstructed — the original "def handle(...)" line
    # was destroyed by extraction; confirm parameter list against VCS.
    def handle(self, args: List[str]) -> bool:
        """Parse `binder check` arguments, dispatch the requested check group(s),
        and report results.  Returns True when every run passed."""
        all_group_names = list(self.GROUPS.keys()) + ["all"]
        parser = argparse.ArgumentParser(
            prog="binder check",
            description="Run quality checks on book content",
            add_help=True,
        )
        parser.add_argument(
            "subcommand",
            nargs="?",
            choices=all_group_names,
            help="Check group to run (refs, labels, headers, footnotes, figures, rendering, references, content, all)",
        )
        parser.add_argument("--scope", default=None, help="Narrow to a specific check within a group")
        parser.add_argument("--path", default=None, help="File or directory path to check")
        parser.add_argument("--vol1", action="store_true", help="Scope to Volume I")
        parser.add_argument("--vol2", action="store_true", help="Scope to Volume II")
        parser.add_argument("--json", action="store_true", help="Emit machine-readable JSON output")
        parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
        parser.add_argument("--citations-in-code", action="store_true", help="refs: check citations in code fences")
        parser.add_argument("--citations-in-raw", action="store_true", help="refs: check citations in raw blocks")
        parser.add_argument("--check-patterns", action="store_true", default=True, help="refs --scope inline: include pattern hazard checks (default: on)")
        parser.add_argument("--no-check-patterns", action="store_false", dest="check_patterns", help="refs --scope inline: skip pattern hazard checks")
        parser.add_argument("--check-scope", action="store_true", default=False, help="refs --scope inline: detect bare variable refs in class bodies that need ClassName.attr")
        parser.add_argument("--no-check-scope", action="store_false", dest="check_scope", help="refs --scope inline: skip scope analysis")
        parser.add_argument("--figures", action="store_true", help="labels: filter to figures")
        parser.add_argument("--tables", action="store_true", help="labels: filter to tables")
        parser.add_argument("--sections", action="store_true", help="labels: filter to sections")
        parser.add_argument("--equations", action="store_true", help="labels: filter to equations")
        parser.add_argument("--listings", action="store_true", help="labels: filter to listings")
        parser.add_argument("--all-types", action="store_true", help="labels: all label types")
        parser.add_argument("-f", "--file", dest="refs_file", action="append", metavar="BIB", help="references: .bib file(s) to check")
        parser.add_argument("-o", "--output", dest="refs_output", metavar="FILE", help="references: write report to FILE")
        parser.add_argument("--limit", type=int, dest="refs_limit", metavar="N", help="references: check only first N refs (quick test)")
        parser.add_argument("--skip-verified", dest="refs_skip_verified", action="store_true", help="references: skip refs already verified in cache")
        parser.add_argument("--thorough", dest="refs_thorough", action="store_true", help="references: revalidate all refs (ignore cache)")
        parser.add_argument("--refs-cache", dest="refs_cache", metavar="FILE", help="references: cache file (default: .references_verified.json in repo root)")
        parser.add_argument("--only-from-report", dest="refs_only_from_report", metavar="FILE", help="references: validate only keys that had issues in this report file")
        parser.add_argument("--only-keys", dest="refs_only_keys_file", metavar="FILE", help="references: validate only keys listed in FILE (one key per line)")
        try:
            ns = parser.parse_args(args)
        except SystemExit:
            # argparse uses SystemExit(0) for --help and non-zero for parse errors.
            return ("-h" in args) or ("--help" in args)
        if not ns.subcommand:
            self._print_check_help()
            return False
        root_path = self._resolve_path(ns.path, ns.vol1, ns.vol2)
        if not root_path.exists():
            self._emit(ns.json, {"status": "error", "message": f"Path not found: {root_path}"}, failed=True)
            return False
        runs: List[ValidationRunResult] = []
        if ns.subcommand == "all":
            for group_name in self.GROUPS:
                runs.extend(self._run_group(group_name, None, root_path, ns))
        else:
            group_name = ns.subcommand
            scope = ns.scope
            # Reject a --scope that is not declared for the chosen group.
            if scope and not any(s == scope for s, _ in self.GROUPS.get(group_name, [])):
                valid = [s for s, _ in self.GROUPS[group_name]]
                console.print(f"[red]Unknown scope '{scope}' for group '{group_name}'.[/red]")
                console.print(f"[yellow]Valid scopes: {', '.join(valid)}[/yellow]")
                return False
            runs.extend(self._run_group(group_name, scope, root_path, ns))
        any_failed = any(not run.passed for run in runs)
        summary = {
            "status": "failed" if any_failed else "passed",
            "command": ns.subcommand,
            "path": str(root_path),
            "runs": [run.to_dict() for run in runs],
            "total_issues": sum(len(run.issues) for run in runs),
        }
        if ns.json:
            print(json.dumps(summary, indent=2))
        else:
            self._print_human_summary(summary, verbose=ns.verbose)
        return not any_failed

    # ------------------------------------------------------------------
    # Group dispatch
    # ------------------------------------------------------------------
    def _run_group(
        self,
        group: str,
        scope: Optional[str],
        root: Path,
        ns: argparse.Namespace,
    ) -> List[ValidationRunResult]:
        """Run all checks in *group*, or just the one matching *scope*."""
        results: List[ValidationRunResult] = []
        for scope_name, method_name in self.GROUPS[group]:
            if scope and scope != scope_name:
                continue
            method = getattr(self, method_name)
            # Some runners need extra kwargs
            if method_name == "_run_refs":
                # Default: when neither flag is given, run both citation checks.
                checks_code = ns.citations_in_code or (not ns.citations_in_code and not ns.citations_in_raw)
                checks_raw = ns.citations_in_raw or (not ns.citations_in_code and not ns.citations_in_raw)
                results.append(method(root, citations_in_code=checks_code, citations_in_raw=checks_raw))
            elif method_name == "_run_inline_refs":
                results.append(method(root, check_patterns=ns.check_patterns, check_scope=getattr(ns, 'check_scope', False)))
            elif method_name in ("_run_duplicate_labels", "_run_unreferenced_labels"):
                results.append(method(root, self._selected_label_types(ns)))
            elif method_name == "_run_check_references":
                results.append(method(root, ns))
            else:
                results.append(method(root))
        return results

    def _print_check_help(self) -> None:
        """Print a nicely formatted help for the check command."""
        table = Table(show_header=True, header_style="bold cyan", box=None)
        table.add_column("Group", style="cyan", width=14)
        table.add_column("Scopes", style="yellow", width=38)
        table.add_column("Description", style="white", width=32)
        descriptions = {
            "refs": "References, citations, inline Python, self-ref",
            "labels": "Duplicate labels, orphans, fig-label underscores",
            "headers": "Section header IDs ({#sec-...})",
            "footnotes": "Placement, integrity, cross-chapter duplicates",
            "figures": "Captions, float flow, image files",
            "rendering": "Patterns, indexes, dropcaps, headings, typos, tables, ASCII",
            "images": "Image file formats, external URLs",
            "json": "JSON file syntax validation",
            "units": "Physics engine unit conversion tests",
            "spelling": "Prose and TikZ spell checking (requires aspell)",
            "epub": "EPUB file validation",
            "sources": "Source citation analysis and validation",
            "references": "Bibliography vs academic DBs (hallucinator)",
            "content": "Content tree (shared/, frontmatter/ required)",
        }
        for group_name, checks in self.GROUPS.items():
            scopes = ", ".join(s for s, _ in checks)
            desc = descriptions.get(group_name, "")
            table.add_row(group_name, scopes, desc)
        table.add_row("all", "(everything)", "Run all checks")
        console.print(Panel(table, title="binder check [--scope ]", border_style="cyan"))
        console.print("[dim]Examples:[/dim]")
        console.print("  [cyan]./binder check refs[/cyan] [dim]# all reference checks[/dim]")
        console.print("  [cyan]./binder check refs --scope citations[/cyan] [dim]# only citation check[/dim]")
        console.print("  [cyan]./binder check figures --vol1[/cyan] [dim]# all figure checks, Vol I[/dim]")
        console.print("  [cyan]./binder check all[/cyan] [dim]# everything[/dim]")
        console.print()
    # ------------------------------------------------------------------
    def _resolve_path(self, path_arg: Optional[str], vol1: bool, vol2: bool) -> Path:
        """Resolve the content root: explicit --path wins, then --vol1/--vol2,
        then the whole contents/ tree."""
        if path_arg:
            path = Path(path_arg)
            if not path.is_absolute():
                path = (Path.cwd() / path).resolve()
            return path
        base = self.config_manager.book_dir / "contents"
        if vol1 and not vol2:
            return base / "vol1"
        if vol2 and not vol1:
            return base / "vol2"
        return base

    def _selected_label_types(self, ns: argparse.Namespace) -> Dict[str, List[re.Pattern[str]]]:
        """Map the label-type CLI flags to the subset of LABEL_DEF_PATTERNS to use."""
        explicit = ns.figures or ns.tables or ns.sections or ns.equations or ns.listings
        if ns.all_types:
            return LABEL_DEF_PATTERNS
        if explicit:
            selected: Dict[str, List[re.Pattern[str]]] = {}
            if ns.figures:
                selected["Figure"] = LABEL_DEF_PATTERNS["Figure"]
            if ns.tables:
                selected["Table"] = LABEL_DEF_PATTERNS["Table"]
            if ns.sections:
                selected["Section"] = LABEL_DEF_PATTERNS["Section"]
            if ns.equations:
                selected["Equation"] = LABEL_DEF_PATTERNS["Equation"]
            if ns.listings:
                selected["Listing"] = LABEL_DEF_PATTERNS["Listing"]
            return selected
        # default: all label types
        return LABEL_DEF_PATTERNS

    def _qmd_files(self, root: Path) -> List[Path]:
        """Return the .qmd files under *root* (or [root] when it is itself a .qmd)."""
        if root.is_file():
            return [root] if root.suffix == ".qmd" else []
        return sorted(root.rglob("*.qmd"))

    def _read_text(self, path: Path) -> str:
        """Read a file as UTF-8, tolerating (ignoring) undecodable bytes."""
        try:
            return path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            return path.read_text(encoding="utf-8", errors="ignore")

    def _relative_file(self, path: Path) -> str:
        """Render *path* relative to the book dir for compact issue reports."""
        try:
            return str(path.relative_to(self.config_manager.book_dir))
        except ValueError:
            # Path lies outside the book tree — report it as-is.
            return str(path)
    def _run_python_syntax(self, root: Path) -> ValidationRunResult:
        """Compile every ```{python} code block to catch syntax errors."""
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        block_start_re = re.compile(r"^```\{python\}")
        block_end_re = re.compile(r"^```\s*$")
        for file in files:
            content = self._read_text(file)
            lines = content.split("\n")
            rel = str(file.relative_to(root)) if file.is_relative_to(root) else str(file)
            in_block = False
            block_lines: List[str] = []
            block_start_line = 0
            for i, line in enumerate(lines, start=1):
                if block_start_re.match(line):
                    in_block = True
                    block_lines = []
                    block_start_line = i
                    continue
                if in_block and block_end_re.match(line):
                    in_block = False
                    # Skip YAML-style #| directives before compiling
                    source_lines = [
                        ln for ln in block_lines
                        if not ln.strip().startswith("#|")
                    ]
                    source = "\n".join(source_lines)
                    if not source.strip():
                        continue
                    try:
                        # Filename encodes origin so SyntaxError points back to the .qmd.
                        compile(source, f"{rel}:{block_start_line}", "exec")
                    except SyntaxError as exc:
                        err_line = block_start_line + (exc.lineno or 1)
                        issues.append(ValidationIssue(
                            file=rel,
                            line=err_line,
                            code="python_syntax",
                            message=f"Python syntax error: {exc.msg}",
                            severity="error",
                            context=(exc.text or "").strip()[:120],
                        ))
                    continue
                if in_block:
                    block_lines.append(line)
        return ValidationRunResult(
            name="python-syntax",
            description="Validate Python code block syntax (compile check)",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )
    # NOTE(review): signature reconstructed from the call site in _run_group
    # (the original "def _run_refs(...)" line was destroyed by extraction);
    # confirm parameter names/defaults against VCS.
    def _run_refs(self, root: Path, citations_in_code: bool, citations_in_raw: bool) -> ValidationRunResult:
        """Flag @citations placed inside code fences or raw blocks, where
        Pandoc will not process them."""
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        fenced_code_pattern = re.compile(r"```\{([^}]+)\}(.*?)```", re.DOTALL)
        raw_block_pattern = re.compile(r"```\{=(html|latex|tex)\}(.*?)```", re.DOTALL | re.IGNORECASE)
        # Only these fence classes are rendered verbatim, so citations die there.
        problematic_classes = {"tikz", "latex", "tex"}
        for file in files:
            content = self._read_text(file)
            if citations_in_code:
                for match in fenced_code_pattern.finditer(content):
                    attrs = match.group(1)
                    code_content = match.group(2)
                    class_match = re.search(r"\.([A-Za-z][A-Za-z0-9_-]*)", attrs)
                    cls = class_match.group(1).lower() if class_match else "unknown"
                    if cls not in problematic_classes:
                        continue
                    for cite_match in CITATION_BRACKET_PATTERN.finditer(code_content):
                        # Translate the match offset back to a 1-based line number.
                        offset = match.start() + len(f"```{{{attrs}}}") + cite_match.start()
                        line_no = content[:offset].count("\n") + 1
                        line = content.splitlines()[line_no - 1] if line_no - 1 < len(content.splitlines()) else ""
                        issues.append(ValidationIssue(
                            file=self._relative_file(file),
                            line=line_no,
                            code="citation_in_code",
                            message=f"Citation in .{cls} code block will not be processed",
                            severity="error",
                            context=line.strip()[:160],
                        ))
            if citations_in_raw:
                for match in raw_block_pattern.finditer(content):
                    raw_type = match.group(1).lower()
                    block = match.group(2)
                    for cite_match in CITATION_BRACKET_PATTERN.finditer(block):
                        offset = match.start() + cite_match.start()
                        line_no = content[:offset].count("\n") + 1
                        line = content.splitlines()[line_no - 1] if line_no - 1 < len(content.splitlines()) else ""
                        issues.append(ValidationIssue(
                            file=self._relative_file(file),
                            line=line_no,
                            code="citation_in_raw",
                            message=f"Citation in raw {raw_type} block will not be processed",
                            severity="error",
                            context=line.strip()[:160],
                        ))
        return ValidationRunResult(
            name="refs",
            description="Validate citation/reference placement in raw/code blocks",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )
    def _bibliography_for_qmd(self, file: Path) -> Optional[Path]:
        """Resolve the volume backmatter references.bib for a .qmd from its path."""
        try:
            rel = file.relative_to(self.config_manager.book_dir)
        except ValueError:
            return None
        parts = rel.parts
        if "vol1" in parts:
            bib_file = self.config_manager.book_dir / "contents" / "vol1" / "backmatter" / "references.bib"
        elif "vol2" in parts:
            bib_file = self.config_manager.book_dir / "contents" / "vol2" / "backmatter" / "references.bib"
        else:
            return None
        return bib_file if bib_file.exists() else None

    def _run_citations(self, root: Path) -> ValidationRunResult:
        """Cross-check every @key citation in prose against the volume's .bib keys."""
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        bib_key_pattern = re.compile(r"@\w+\{([^,\s]+)")
        for file in files:
            bib_file = self._bibliography_for_qmd(file)
            if bib_file is None:
                continue
            content = self._read_text(file)
            bib_content = self._read_text(bib_file)
            bib_keys = set(bib_key_pattern.findall(bib_content))
            # Strip YAML frontmatter (--- ... --- at file top) to avoid email false positives
            qmd_content_no_code = re.sub(r"^---\n.*?\n---\n", "", content, flags=re.DOTALL)
            # Strip HTML style/script blocks to avoid CSS @media false positives
            # NOTE(review): this regex was damaged during extraction (only
            # r"]*>.*?" survives); reconstructed below — confirm against VCS.
            qmd_content_no_code = re.sub(r"<(?:style|script)[^>]*>.*?</(?:style|script)>", "", qmd_content_no_code, flags=re.DOTALL)
            qmd_content_no_code = re.sub(r"```.*?```", "", qmd_content_no_code, flags=re.DOTALL)
            qmd_content_no_code = re.sub(r"`[^`]+`", "", qmd_content_no_code)
            refs = set(CITATION_REF_PATTERN.findall(qmd_content_no_code))
            # Drop cross-reference prefixes (@fig-, @sec-, ...) and trailing punctuation.
            refs = {r.rstrip(".,;:") for r in refs if not r.startswith(EXCLUDED_CITATION_PREFIXES)}
            # Drop version-number false positives like "@1.2".
            refs = {r for r in refs if not re.match(r"^\d+\.\d+", r)}
            missing = sorted(refs - bib_keys)
            for key in missing:
                line_no = self._line_for_token(content, f"@{key}")
                issues.append(ValidationIssue(
                    file=self._relative_file(file),
                    line=line_no,
                    code="missing_citation",
                    message=f"Citation key @{key} missing in bibliography",
                    severity="error",
                    context=f"@{key}",
                ))
        return ValidationRunResult(
            name="citations",
            description="Validate citation keys against bibliography files",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )
    def _run_duplicate_labels(self, root: Path, label_types: Dict[str, List[re.Pattern[str]]]) -> ValidationRunResult:
        """Find labels defined more than once across the tree (code fences excluded)."""
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        # label -> every (file, line, type) location that defines it
        definitions: Dict[str, List[Tuple[Path, int, str]]] = {}
        for file in files:
            lines = self._read_text(file).splitlines()
            in_code = False
            for idx, line in enumerate(lines, 1):
                stripped = line.strip()
                # Toggle on every fence so fenced content is skipped.
                if stripped.startswith("```"):
                    in_code = not in_code
                    continue
                if in_code:
                    continue
                for label_type, patterns in label_types.items():
                    for pattern in patterns:
                        for match in pattern.finditer(line):
                            label = match.group(1)
                            definitions.setdefault(label, []).append((file, idx, label_type))
        for label, locations in definitions.items():
            if len(locations) <= 1:
                continue
            # Report every location of a duplicated label, not just the extras.
            for file, line_no, label_type in locations:
                issues.append(ValidationIssue(
                    file=self._relative_file(file),
                    line=line_no,
                    code="duplicate_label",
                    message=f"Duplicate {label_type.lower()} label: {label}",
                    severity="error",
                    context=label,
                ))
        return ValidationRunResult(
            name="duplicate-labels",
            description="Detect duplicate label definitions",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )
    def _run_unreferenced_labels(self, root: Path, label_types: Dict[str, List[re.Pattern[str]]]) -> ValidationRunResult:
        """Find labels never referenced (warning) and @references that resolve
        to no definition (error)."""
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        # first definition wins per label; all reference sites are collected
        defined: Dict[str, Tuple[Path, int, str]] = {}
        references: Dict[str, List[Tuple[Path, int]]] = {}
        for file in files:
            lines = self._read_text(file).splitlines()
            for idx, line in enumerate(lines, 1):
                for label_type, patterns in label_types.items():
                    for pattern in patterns:
                        for match in pattern.finditer(line):
                            defined.setdefault(match.group(1), (file, idx, label_type))
                for match in LABEL_REF_PATTERN.finditer(line):
                    label = match.group(1)
                    references.setdefault(label, []).append((file, idx))
        # unreferenced definitions (skip section defaults, consistent with legacy behavior)
        for label, (file, line_no, label_type) in defined.items():
            if label_type == "Section":
                continue
            if label not in references:
                issues.append(ValidationIssue(
                    file=self._relative_file(file),
                    line=line_no,
                    code="unreferenced_label",
                    message=f"{label_type} label {label} is never referenced",
                    severity="warning",
                    context=label,
                ))
        # unresolved references
        defined_labels = set(defined.keys())
        for label, locations in references.items():
            if label in defined_labels:
                continue
            for file, line_no in locations:
                issues.append(ValidationIssue(
                    file=self._relative_file(file),
                    line=line_no,
                    code="unresolved_reference",
                    message=f"Reference @{label} has no matching label definition",
                    severity="error",
                    context=f"@{label}",
                ))
        return ValidationRunResult(
            name="unreferenced-labels",
            description="Detect unreferenced labels and unresolved references",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )
    def _run_inline_refs(self, root: Path, check_patterns: bool, check_scope: bool = False) -> ValidationRunResult:
        """Validate `{python} var` inline references against names assigned in
        the file's python cells, and optionally flag fragile rendering patterns."""
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        # Hazard patterns: inline python in places Quarto renders poorly or not at all.
        yaml_option_inline = re.compile(r"^#\|\s*(fig-cap|tbl-cap|lst-cap|fig-alt):\s*.*`\{python\}")
        caption_syntax_inline = re.compile(r"^:\s+.*`\{python\}.*\{#(tbl|fig|lst)-")
        inline_fstring = re.compile(r"`\{python\}\s*f\"[^`]+`")
        inline_func_call = re.compile(r"`\{python\}\s*\w+\([^`]+\)`")
        for file in files:
            lines = self._read_text(file).splitlines()
            refs: List[Tuple[int, str]] = []
            compute_vars: Set[str] = set()
            compute_classes: Set[str] = set()
            in_cell = False
            # Pass 1: harvest names bound in python cells and inline refs in prose.
            for idx, line in enumerate(lines, 1):
                if CELL_START_PATTERN.match(line.strip()):
                    in_cell = True
                    continue
                if in_cell and CELL_END_PATTERN.match(line.strip()):
                    in_cell = False
                    continue
                if in_cell:
                    cls_match = CLASS_DEF_PATTERN.match(line.strip())
                    if cls_match:
                        compute_classes.add(cls_match.group(1))
                    assign = ASSIGN_PATTERN.match(line.strip())
                    if assign:
                        compute_vars.add(assign.group(1))
                    tuple_assign = TUPLE_ASSIGN_PATTERN.match(line.strip())
                    if tuple_assign:
                        for name in re.split(r'\s*,\s*', tuple_assign.group(1)):
                            compute_vars.add(name.strip())
                for match in INLINE_REF_PATTERN.finditer(line):
                    refs.append((idx, match.group(1)))
            # Pass 2: every inline ref must resolve to a cell-bound name.
            for line_no, ref in refs:
                if "." in ref:
                    # Dotted ref: the prefix must be a known class or variable.
                    cls_name = ref.split(".", 1)[0]
                    resolved = cls_name in compute_classes or cls_name in compute_vars
                else:
                    resolved = ref in compute_vars
                if not resolved:
                    issues.append(ValidationIssue(
                        file=self._relative_file(file),
                        line=line_no,
                        code="undefined_inline_ref",
                        message=f"Inline reference `{ref}` is not defined in python cells",
                        severity="error",
                        context=f"`{{python}} {ref}`",
                    ))
            if check_patterns:
                in_grid = False
                for idx, line in enumerate(lines, 1):
                    stripped = line.strip()
                    if LATEX_INLINE_PATTERN.search(line):
                        issues.append(ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="latex_math_inline_python",
                            message="Inline Python inside LaTeX math can strip decimals",
                            severity="warning",
                            context=stripped[:160],
                        ))
                    if LATEX_ADJACENT_PATTERN.search(line):
                        issues.append(ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="latex_adjacent_inline_python",
                            message="Inline Python adjacent to LaTeX operator is fragile",
                            severity="warning",
                            context=stripped[:160],
                        ))
                    # Track whether we are inside a +---+ grid table.
                    if GRID_TABLE_SEP_PATTERN.match(stripped):
                        in_grid = True
                    elif in_grid and stripped and not stripped.startswith("|"):
                        in_grid = False
                    if in_grid and "`{python}" in line:
                        issues.append(ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="grid_table_inline_python",
                            message="Inline Python in grid tables is unsupported",
                            severity="error",
                            context=stripped[:160],
                        ))
                    if inline_fstring.search(line):
                        issues.append(ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="inline_fstring",
                            message="Inline f-string should be precomputed in Python cell",
                            severity="warning",
                            context=stripped[:160],
                        ))
                    if inline_func_call.search(line):
                        issues.append(ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="inline_function_call",
                            message="Inline function call should be precomputed in Python cell",
                            severity="warning",
                            context=stripped[:160],
                        ))
                    if yaml_option_inline.search(line):
                        issues.append(ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="yaml_option_inline_python",
                            message="Inline Python in YAML fig/tbl/lst metadata will not render",
                            severity="error",
                            context=stripped[:160],
                        ))
                    if caption_syntax_inline.search(line):
                        issues.append(ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="caption_inline_python",
                            message="Inline Python in caption syntax will not render",
                            severity="error",
                            context=stripped[:160],
                        ))
            if check_scope:
                from book.quarto.mlsys.validate_inline_refs import check_scope as _check_scope, BOOK_ROOT
                try:
                    scope_warnings = _check_scope(file, verbose=False)
                    for filepath, lineno, check_type, msg in scope_warnings:
                        issues.append(ValidationIssue(
                            file=self._relative_file(file),
                            line=lineno,
                            code=check_type.lower(),
                            message=msg,
                            severity="warning",
                            context="",
                        ))
                except Exception:
                    # Best-effort: scope analysis is advisory, so failures are swallowed.
                    pass
        return ValidationRunResult(
            name="inline-refs",
            description="Validate inline Python refs and rendering hazard patterns",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )
    # ------------------------------------------------------------------
    # Headers (ported from manage_section_ids.py --verify)
    # ------------------------------------------------------------------
    def _run_headers(self, root: Path) -> ValidationRunResult:
        """Verify every numbered section header carries a {#sec-...} ID."""
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        header_pat = re.compile(r"^(#{1,6})\s+(.+?)(?:\s*\{[^}]*\})?$")
        div_start_pat = re.compile(r"^:::\s*\{\.")
        div_end_pat = re.compile(r"^:::\s*$")
        code_block_pat = re.compile(r"^```[^`]*$")
        sec_id_pat = re.compile(r"\{#sec-[^}]+\}")
        for file in files:
            lines = self._read_text(file).splitlines()
            in_code = False
            in_div = False
            for idx, line in enumerate(lines, 1):
                stripped = line.strip()
                if code_block_pat.match(stripped):
                    in_code = not in_code
                    continue
                if in_code:
                    continue
                # Headers inside ::: divs (callouts etc.) are exempt.
                if div_start_pat.match(stripped):
                    in_div = True
                    continue
                if div_end_pat.match(stripped):
                    in_div = False
                    continue
                if in_div:
                    continue
                match = header_pat.match(line)
                if not match:
                    continue
                # Extract existing attributes
                existing_attrs = ""
                if "{" in line:
                    attrs_start = line.find("{")
                    attrs_end = line.rfind("}")
                    if attrs_end > attrs_start:
                        existing_attrs = line[attrs_start : attrs_end + 1]
                # Unnumbered headers do not need an ID.
                if ".unnumbered" in existing_attrs:
                    continue
                if not sec_id_pat.search(line):
                    title = match.group(2).strip()
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="missing_section_id",
                            message=f"Header missing section ID: {title}",
                            severity="error",
                            context=line.strip()[:160],
                        )
                    )
        return ValidationRunResult(
            name="headers",
            description="Verify section headers have {#sec-...} IDs",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )
    # ------------------------------------------------------------------
    # Footnote Placement (ported from check_forbidden_footnotes.py)
    # ------------------------------------------------------------------
    def _run_footnote_placement(self, root: Path) -> ValidationRunResult:
        """Flag footnotes placed where they break rendering: inline syntax,
        table cells, captions, callout titles, and div blocks."""
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        fn_pat = re.compile(r"\[\^fn-[\w-]+\]")
        inline_fn_pat = re.compile(r"\^\[[^\]]+\]")
        table_sep_pat = re.compile(r"^\|[\s\-:+]+\|")
        for file in files:
            lines = self._read_text(file).splitlines()
            div_depth = 0
            div_start_line = 0
            for idx, line in enumerate(lines, 1):
                stripped = line.strip()
                # Track div nesting
                if re.match(r"^:{3,4}\s*\{", stripped) or re.match(r"^:{3,4}\s+\w", stripped):
                    div_depth += 1
                    if div_depth == 1:
                        div_start_line = idx
                elif re.match(r"^:{3,4}\s*$", stripped):
                    if div_depth > 0:
                        div_depth -= 1
                        if div_depth == 0:
                            div_start_line = 0
                # Check inline footnotes (always forbidden)
                for m in inline_fn_pat.finditer(line):
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="inline_footnote",
                            message=f"Inline footnote syntax; use [^fn-name] reference format",
                            severity="error",
                            context=m.group(0)[:80],
                        )
                    )
                footnotes = fn_pat.findall(line)
                if not footnotes:
                    continue
                # Table cell check
                if stripped.startswith("|") and stripped.count("|") >= 2 and not table_sep_pat.match(stripped):
                    for fn in footnotes:
                        issues.append(
                            ValidationIssue(
                                file=self._relative_file(file),
                                line=idx,
                                code="footnote_in_table",
                                message=f"Footnote {fn} in table cell",
                                severity="error",
                                context=stripped[:80],
                            )
                        )
                # YAML caption check
                if re.match(r"^\s*(fig-cap|tbl-cap):", line):
                    cap_type = "figure" if "fig-cap" in line else "table"
                    for fn in footnotes:
                        issues.append(
                            ValidationIssue(
                                file=self._relative_file(file),
                                line=idx,
                                code=f"footnote_in_{cap_type}_caption",
                                message=f"Footnote {fn} in {cap_type} caption",
                                severity="error",
                                context=stripped[:80],
                            )
                        )
                # Markdown caption check
                if re.match(r"^:\s*\*\*[^*]+\*\*:", line):
                    for fn in footnotes:
                        issues.append(
                            ValidationIssue(
                                file=self._relative_file(file),
                                line=idx,
                                code="footnote_in_markdown_caption",
                                message=f"Footnote {fn} in markdown caption",
                                severity="error",
                                context=stripped[:80],
                            )
                        )
                # Callout title check
                if re.match(r"^:{3,4}\s*\{.*title=", stripped):
                    title_match = re.search(r'title="([^"]*)"', line)
                    if title_match and fn_pat.search(title_match.group(1)):
                        for fn in fn_pat.findall(title_match.group(1)):
                            issues.append(
                                ValidationIssue(
                                    file=self._relative_file(file),
                                    line=idx,
                                    code="footnote_in_callout_title",
                                    message=f"Footnote {fn} in callout title (breaks LaTeX)",
                                    severity="error",
                                    context=stripped[:80],
                                )
                            )
                # Div block check
                if div_depth > 0 and div_start_line != idx:
                    for fn in footnotes:
                        issues.append(
                            ValidationIssue(
                                file=self._relative_file(file),
                                line=idx,
                                code="footnote_in_div",
                                message=f"Footnote {fn} inside div block (started line {div_start_line})",
                                severity="error",
                                context=stripped[:80],
                            )
                        )
        return ValidationRunResult(
            name="footnote-placement",
            description="Check footnotes in forbidden locations",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )
            name="footnote-placement",
            description="Check footnotes in forbidden locations",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )

    # ------------------------------------------------------------------
    # Footnote Refs (ported from footnote_cleanup.py --validate)
    # ------------------------------------------------------------------
    def _run_footnote_refs(self, root: Path) -> ValidationRunResult:
        """Validate footnote references against footnote definitions.

        For each .qmd file under *root*, reports:
        - references with no matching definition (error),
        - definitions that are never referenced (warning),
        - duplicate definitions of the same id (error),
        - definitions not preceded by a blank line (error), since Pandoc
          then treats them as continuation text rather than footnotes.
        """
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        # A reference is [^id] anywhere on a line; a definition is a line
        # starting with "[^id]: text" (MULTILINE so ^ matches every line).
        ref_pat = re.compile(r"\[\^([^]]+)\]")
        def_pat = re.compile(r"^\[\^([^]]+)\]:\s*(.+)$", re.MULTILINE)
        for file in files:
            content = self._read_text(file)
            lines = content.split("\n")
            # Collect definitions
            fn_defs: Dict[str, str] = {}
            for m in def_pat.finditer(content):
                fn_defs[m.group(1)] = m.group(2)
            # Collect references (excluding definition lines themselves)
            fn_refs: Dict[str, List[int]] = defaultdict(list)
            for line_num, line in enumerate(lines, 1):
                for m in ref_pat.finditer(line):
                    fn_id = m.group(1)
                    dm = def_pat.match(line)
                    if dm and dm.group(1) == fn_id:
                        continue  # definition line, not a reference
                    fn_refs[fn_id].append(line_num)
            # Undefined references
            for fn_id in sorted(set(fn_refs.keys()) - set(fn_defs.keys())):
                first_line = fn_refs[fn_id][0]
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=first_line,
                        code="undefined_footnote_ref",
                        message=f"Undefined footnote reference: [^{fn_id}]",
                        severity="error",
                        context=f"[^{fn_id}]",
                    )
                )
            # Unused definitions
            for fn_id in sorted(set(fn_defs.keys()) - set(fn_refs.keys())):
                def_line = self._line_for_token(content, f"[^{fn_id}]:")
                issues.append(
                    ValidationIssue(
                        file=self._relative_file(file),
                        line=def_line,
                        code="unused_footnote_def",
                        message=f"Unused footnote definition: [^{fn_id}]",
                        severity="warning",
                        context=f"[^{fn_id}]:",
                    )
                )
            # Duplicate definitions
            def_counts: Dict[str, int] = defaultdict(int)
            for line in lines:
                dm = re.match(r"^\[\^([^]]+)\]:", line)
                if dm:
                    def_counts[dm.group(1)] += 1
            for fn_id, count in def_counts.items():
                if count > 1:
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            # Reports the first occurrence's line for all dups.
                            line=self._line_for_token(content, f"[^{fn_id}]:"),
                            code="duplicate_footnote_def",
                            message=f"Duplicate footnote definition ({count}x): [^{fn_id}]",
                            severity="error",
                            context=f"[^{fn_id}]:",
                        )
                    )
            # Missing blank line before footnote definition
            # Pandoc requires footnote definitions to start a new block.
            # Without a preceding blank line, Pandoc treats the definition
            # as continuation text and renders [^fn-name] as literal text.
            fn_def_line_pat = re.compile(r"^\[\^[^\]]+\]:")
            for idx, line in enumerate(lines):
                if fn_def_line_pat.match(line) and idx > 0:
                    prev = lines[idx - 1]
                    if prev.strip():  # previous line is not blank
                        fn_match = re.match(r"^\[\^([^\]]+)\]:", line)
                        fn_id_str = fn_match.group(1) if fn_match else "?"
                        issues.append(
                            ValidationIssue(
                                file=self._relative_file(file),
                                line=idx + 1,
                                code="footnote_missing_blank_line",
                                message=(
                                    f"Footnote definition [^{fn_id_str}] has no blank line before it — "
                                    f"Pandoc will not parse it as a footnote"
                                ),
                                severity="error",
                                context=f"prev: {prev.strip()[:60]}",
                            )
                        )
        return ValidationRunResult(
            name="footnote-refs",
            description="Validate footnote references and definitions",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )

    # ------------------------------------------------------------------
    # Figures (ported from check_figure_completeness.py)
    # ------------------------------------------------------------------
    def _run_figures(self, root: Path) -> ValidationRunResult:
        """Check every figure has a caption and alt-text.

        Pass 1 handles attribute-based figures ({#fig-...} on a line, with
        fig-cap=/fig-alt= attributes or a markdown image caption); pass 2
        handles code-cell figures whose label/fig-cap/fig-alt come from
        Quarto "#|" cell options. Each missing piece is an error.
        """
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        fig_id_pat = re.compile(r"\{#(fig-[a-zA-Z0-9_-]+)[\s}]")
        md_cap_pat = re.compile(r"!\[(.+?)\]\(")
        for file in files:
            lines = self._read_text(file).splitlines()
            seen_ids: Set[str] = set()
            # Pass 1: attribute-based figures
            for idx, line in enumerate(lines, 1):
                m = fig_id_pat.search(line)
                if not m:
                    continue
                fig_id = m.group(1)
                has_cap = bool(re.search(r'fig-cap="[^"]+', line))
                has_alt = bool(re.search(r'fig-alt="[^"]+', line))
                if "![" in line:
                    # Markdown image: the ![...] bracket text counts as a
                    # caption even without an explicit fig-cap= attribute.
                    md_m = md_cap_pat.search(line)
                    if md_m and md_m.group(1).strip():
                        has_cap = True
                seen_ids.add(fig_id)
                missing = []
                if not has_cap:
                    missing.append("caption")
                if not has_alt:
                    missing.append("alt-text")
                if missing:
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=idx,
                            code="incomplete_figure",
                            message=f"Figure {fig_id} missing: {', '.join(missing)}",
                            severity="error",
                            context=line.strip()[:120],
                        )
                    )
            # Pass 2: code-cell figures
            in_code = False
            code_start = 0
            cell_opts: Dict[str, str] = {}
            for idx, line in enumerate(lines, 1):
                stripped = line.rstrip()
                if not in_code and re.match(r"^```\{(?:python|r|julia|ojs)", stripped):
                    in_code = True
                    code_start = idx
                    cell_opts = {}
                    continue
                if in_code and stripped == "```":
                    # Cell closed: evaluate its accumulated "#|" options.
                    label = cell_opts.get("label", "")
                    if label.startswith("fig-") and label not in seen_ids:
                        cap_val = cell_opts.get("fig-cap", "")
                        alt_val = cell_opts.get("fig-alt", "")
                        missing = []
                        if not cap_val:
                            missing.append("caption")
                        if not alt_val:
                            missing.append("alt-text")
                        if missing:
                            issues.append(
                                ValidationIssue(
                                    file=self._relative_file(file),
                                    line=code_start,
                                    code="incomplete_figure",
                                    message=f"Figure {label} missing: {', '.join(missing)}",
                                    severity="error",
                                    context=f"code-cell figure {label}",
                                )
                            )
                        seen_ids.add(label)
                    in_code = False
                    cell_opts = {}
                    continue
                if in_code:
                    # Quarto cell option line: "#| key: value"
                    opt_m = re.match(r"^#\|\s*([\w-]+):\s*(.+)$", stripped)
                    if opt_m:
                        val = opt_m.group(2).strip().strip("\"'")
                        cell_opts[opt_m.group(1)] = val
        return ValidationRunResult(
            name="figures",
            description="Check figures have captions and alt-text",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )

    # ------------------------------------------------------------------
    # Float Flow (ported from figure_table_flow_audit.py)
    # ------------------------------------------------------------------
    # NOTE(review): this signature is cut by a chunk boundary — the return
    # annotation (ValidationRunResult) continues on the next source line.
    def _run_float_flow(self, root: Path) ->
    # NOTE(review): continuation of the `_run_float_flow` signature that is
    # split across a chunk boundary on the previous source line.
    ValidationRunResult:
        # Audits whether each figure/table definition appears close to its
        # first in-prose reference (@fig-.../@tbl-...), ignoring code blocks.
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        # Definitions: fenced divs (::: {#fig-x}), markdown images with an
        # attribute block, and table captions (": caption {#tbl-x}").
        div_def_pat = re.compile(r":::\s*\{[^}]*#((?:fig|tbl)-[\w-]+)")
        img_def_pat = re.compile(r"!\[.*?\]\(.*?\)\s*\{[^}]*#((?:fig|tbl)-[\w-]+)")
        tbl_cap_pat = re.compile(r"^:\s+.*\{[^}]*#((?:fig|tbl)-[\w-]+)")
        ref_pat = re.compile(r"@((?:fig|tbl)-[\w-]+)")
        for file in files:
            lines = self._read_text(file).splitlines()
            defs: Dict[str, int] = {}          # label -> definition line
            refs: Dict[str, List[int]] = defaultdict(list)  # label -> ref lines
            in_code = False
            in_float = False
            float_label: Optional[str] = None
            code_spans: List[Tuple[int, int]] = []  # (start, end) of code blocks
            code_start = 0
            cell_opts: Dict[str, str] = {}
            for idx, line in enumerate(lines, 1):
                stripped = line.rstrip()
                # Code block tracking
                if not in_code and re.match(r"^```\{", stripped):
                    in_code = True
                    code_start = idx
                    cell_opts = {}
                    continue
                if in_code and stripped == "```":
                    code_spans.append((code_start, idx))
                    # A labelled code cell is itself a float definition.
                    label = cell_opts.get("label", "")
                    if label.startswith(("fig-", "tbl-")) and label not in defs:
                        defs[label] = code_start
                    in_code = False
                    cell_opts = {}
                    continue
                if in_code:
                    opt_m = re.match(r"^#\|\s*([\w-]+):\s*(.+)$", stripped)
                    if opt_m:
                        cell_opts[opt_m.group(1)] = opt_m.group(2).strip().strip("\"'")
                    continue
                # Attribute-based definitions
                for pat in [div_def_pat, img_def_pat, tbl_cap_pat]:
                    m = pat.search(line)
                    if m:
                        label = m.group(1)
                        if label not in defs:
                            defs[label] = idx
                        if pat == div_def_pat:
                            # Entering a fenced div float; refs inside it to
                            # its own label are not counted (see below).
                            in_float = True
                            float_label = label
                # Track float block end
                if in_float:
                    ls = line.strip()
                    if ls.startswith(":::") and not ls.startswith("::: {"):
                        in_float = False
                        float_label = None
                # References
                if "fig-cap=" in line or "fig-alt=" in line:
                    continue  # @refs inside caption/alt attributes don't count
                for m in ref_pat.finditer(line):
                    label = m.group(1)
                    if in_float and label == float_label:
                        continue  # self-reference inside the float's own div
                    refs[label].append(idx)
            # Evaluate status
            all_labels = set(defs.keys()) | set(refs.keys())
            for label in sorted(all_labels):
                def_line = defs.get(label)
                ref_lines = refs.get(label, [])
                first_ref = min(ref_lines) if ref_lines else None
                if not def_line:
                    continue  # XREF — informational, skip
                if not first_ref:
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=def_line,
                            code="orphan_float",
                            message=f"{'Figure' if label.startswith('fig-') else 'Table'} {label} defined but never referenced",
                            severity="warning",
                            context=label,
                        )
                    )
                    continue
                # Compute prose gap
                gap = def_line - first_ref
                code_lines = 0
                if gap > 0:
                    # Discount lines that fall inside code blocks between the
                    # first reference and the definition.
                    for cs, ce in code_spans:
                        os_ = max(first_ref, cs)
                        oe_ = min(def_line, ce)
                        if os_ <= oe_:
                            code_lines += oe_ - os_ + 1
                prose_gap = gap - code_lines
                if prose_gap > 30:
                    # Check closest reference
                    closest = min(ref_lines, key=lambda r: abs(def_line - r))
                    closest_gap = def_line - closest
                    if -5 <= closest_gap <= 30:
                        continue  # OK
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=def_line,
                            code="late_float",
                            message=f"{label} defined at L{def_line}, first referenced at L{first_ref} (too far after mention)",
                            severity="warning",
                            context=label,
                        )
                    )
                elif prose_gap < -5:
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=def_line,
                            code="early_float",
                            message=f"{label} defined at L{def_line}, first referenced at L{first_ref} (appears before mention)",
                            severity="warning",
                            context=label,
                        )
                    )
        return ValidationRunResult(
            name="float-flow",
            description="Audit figure/table placement relative to first reference",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )

    # ------------------------------------------------------------------
    # Indexes (ported from check_index_placement.py)
    # ------------------------------------------------------------------
    def _run_indexes(self, root: Path) -> ValidationRunResult:
        r"""Flag \index{} commands in positions known to break rendering
        (on headings, adjacent to ::: divs, or before footnote definitions).
        Code blocks are skipped via simple ``` fence toggling.
        """
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        # (issue code, line pattern, human-readable message)
        checks = [
            ("index_on_heading", re.compile(r"^#{1,6}\s+.*\\index\{"), "\\index{} on same line as heading"),
            ("index_before_div", re.compile(r"\\index\{[^}]*\}:::"), "\\index{} directly before ::: (div/callout)"),
            ("index_after_div", re.compile(r"^::+\s+\{[^}]*\}\s*\\index\{"), "\\index{} on same line as div/callout"),
            ("index_before_footnote", re.compile(r"^\\index\{[^}]*\}.*\[\^[^\]]+\]:"), "\\index{} before footnote definition"),
        ]
        for file in files:
            lines = self._read_text(file).splitlines()
            in_code = False
            for idx, line in enumerate(lines, 1):
                if line.strip().startswith("```"):
                    in_code = not in_code
                    continue
                if in_code:
                    continue
                for code, pattern, message in checks:
                    # Skip fig-cap lines for index_after_div
                    if code == "index_after_div" and "fig-cap=" in line:
                        continue
                    if pattern.search(line):
                        issues.append(
                            ValidationIssue(
                                file=self._relative_file(file),
                                line=idx,
                                code=code,
                                message=message,
                                severity="error",
                                context=line.strip()[:120],
                            )
                        )
        return ValidationRunResult(
            name="indexes",
            description="Check LaTeX \\index{} placement",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )

    # ------------------------------------------------------------------
    # Rendering (ported from check_render_patterns.py)
    # ------------------------------------------------------------------
    def _run_rendering(self, root: Path) -> ValidationRunResult:
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        # NOTE(review): the remainder of this method is truncated in this
        # chunk — the first regex appears to begin a lookbehind ("(?<...")
        # and everything from that "<" onward was destroyed during text
        # extraction. Recover the original from version control before
        # editing; do not guess the pattern.
        regex_checks = [
            ("missing_opening_backtick", re.compile(r"(?
    # NOTE(review): this method's "def" line was destroyed during text
    # extraction (everything up to the return annotation is missing).
    # Given the result name "python-echo" it is presumably
    # "def _run_python_echo(self, root: Path) -> ..." — confirm against
    # version control before editing.
    ValidationRunResult:
        """Ensure every ```{python} block has #| echo: false (code must not appear in output)."""
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        block_start_re = re.compile(r"^```\{python\}")
        block_end_re = re.compile(r"^```\s*$")
        # Quarto chunk option: #| echo: false (with optional whitespace)
        echo_false_re = re.compile(r"#\|\s*echo\s*:\s*false", re.IGNORECASE)
        for file in files:
            lines = self._read_text(file).splitlines()
            i = 0
            while i < len(lines):
                line = lines[i]
                if not block_start_re.match(line):
                    i += 1
                    continue
                start_line = i + 1  # 1-based line of the opening fence
                found_echo_false = False
                j = i + 1
                # Scan option lines: #| key: value, or blank, until we hit code or closing ```
                while j < len(lines):
                    next_line = lines[j]
                    if block_end_re.match(next_line):
                        break
                    stripped = next_line.strip()
                    if echo_false_re.search(stripped):
                        found_echo_false = True
                        break
                    # Option line or blank — keep scanning
                    if stripped.startswith("#|") or not stripped:
                        j += 1
                        continue
                    # Non-option line (actual code or comment) — options are done
                    break
                if not found_echo_false:
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=start_line,
                            code="python_missing_echo_false",
                            message="Python block must include #| echo: false — code must not appear in rendered output",
                            severity="error",
                            context="Add #| echo: false as first line after ```{python}",
                        )
                    )
                # Advance past this block to the line after closing ```
                k = j
                while k < len(lines) and not block_end_re.match(lines[k]):
                    k += 1
                i = k + 1
        return ValidationRunResult(
            name="python-echo",
            description="Check Python blocks have echo: false",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )

    # ------------------------------------------------------------------
    # Dropcaps (ported from validate_dropcap_compat.py)
    # ------------------------------------------------------------------
    def _run_dropcaps(self, root: Path) -> ValidationRunResult:
        # NOTE(review): this method is truncated in this chunk — only its
        # pattern definitions are visible; the body runs past the visible end.
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []
        # Line classifiers for the paragraph that opens a chapter.
        chapter_hdr = re.compile(r"^#\s+[^#].*\{#sec-")
        numbered_h2 = re.compile(r"^##\s+[^#]")
        unnumbered_h2 = re.compile(r"^##\s+.*\{.*\.unnumbered.*\}")
        starts_xref = re.compile(r"^\s*@(sec|fig|tbl|lst|eq)-")
        starts_link = re.compile(r"^\s*\[")
        starts_inline = re.compile(r"^\s*`")
        yaml_fence = re.compile(r"^---\s*$")
        code_fence = re.compile(r"^```")
        div_fence = re.compile(r"^:::")
        blank = re.compile(r"^\s*$")
        # NOTE(review): truncated mid-pattern during extraction — presumably
        # the pattern matched HTML comments and everything from its "<"
        # onward (plus the rest of the method) was lost. Recover from
        # version control before editing.
        html_comment = re.compile(r"^\s*