diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4d8ba16b2..89013eac7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -121,8 +121,7 @@ repos: - id: book-verify-section-ids name: "Book: Verify all sections have IDs" # NOTE: Currently only checking Vol1 - Vol2 is still in early development - # Uses language: system because manage_section_ids.py requires nltk - entry: python3 book/tools/scripts/content/manage_section_ids.py -d book/quarto/contents/vol1/ --verify --force + entry: ./book/binder validate section-ids --vol1 language: system pass_filenames: false files: ^book/quarto/contents/vol1/.*\.qmd$ @@ -159,15 +158,15 @@ repos: - id: book-validate-footnotes name: "Book: Validate footnote references" - entry: python book/tools/scripts/content/footnote_cleanup.py -d book/quarto/contents/ --validate - language: python + entry: ./book/binder validate footnotes + language: system pass_filenames: false files: ^book/quarto/contents/.*\.qmd$ - id: book-check-forbidden-footnotes name: "Book: Check for footnotes in tables/captions" - entry: python book/tools/scripts/content/check_forbidden_footnotes.py -d book/quarto/contents/ - language: python + entry: ./book/binder validate forbidden-footnotes + language: system pass_filenames: false files: ^book/quarto/contents/.*\.qmd$ @@ -196,16 +195,16 @@ repos: - id: book-check-figure-completeness name: "Book: Check figures have captions and alt-text" - entry: python book/tools/scripts/content/check_figure_completeness.py -d book/quarto/contents/ --strict --quiet - language: python + entry: ./book/binder validate figure-completeness + language: system pass_filenames: false files: ^book/quarto/contents/.*\.qmd$ - id: book-check-figure-placement name: "Book: Check figure/table placement (near first reference)" - entry: python book/tools/scripts/content/figure_table_flow_audit.py --strict --quiet - language: python - pass_filenames: true + entry: ./book/binder validate figure-placement + language: 
system + pass_filenames: false files: ^book/quarto/contents/.*\.qmd$ - id: book-check-table-formatting @@ -232,17 +231,17 @@ repos: - id: book-check-render-patterns name: "Book: Check for rendering issues (LaTeX+Python)" - entry: python book/tools/scripts/utilities/check_render_patterns.py - language: python - pass_filenames: true + entry: ./book/binder validate render-patterns + language: system + pass_filenames: false files: ^book/quarto/contents/.*\.qmd$ verbose: true - id: book-validate-dropcap name: "Book: Validate drop cap compatibility" - entry: python book/tools/scripts/content/validate_dropcap_compat.py - language: python - pass_filenames: true + entry: ./book/binder validate dropcap + language: system + pass_filenames: false files: ^book/quarto/contents/.*\.qmd$ - id: book-mlsys-validate-inline @@ -262,16 +261,15 @@ repos: - id: book-check-index-placement name: "Book: Check index placement (not inline with headings/callouts)" - entry: python book/tools/scripts/content/check_index_placement.py - language: python - pass_filenames: true + entry: ./book/binder validate index-placement + language: system + pass_filenames: false files: ^book/quarto/contents/.*\.qmd$ - id: book-validate-part-keys name: "Book: Validate part keys" - entry: python book/tools/scripts/utilities/validate_part_keys.py - language: python - additional_dependencies: [pyyaml] + entry: ./book/binder validate part-keys + language: system pass_filenames: false files: ^book/.*\.qmd$ @@ -296,8 +294,8 @@ repos: - id: book-validate-image-references name: "Book: Check image references exist" - entry: python book/tools/scripts/images/validate_image_references.py -d book/quarto/contents/ --quiet - language: python + entry: ./book/binder validate image-refs + language: system pass_filenames: false files: ^book/quarto/contents/.*\.qmd$ diff --git a/book/cli/commands/maintenance.py b/book/cli/commands/maintenance.py index 97a73f4b6..eb5272f42 100644 --- a/book/cli/commands/maintenance.py +++ 
b/book/cli/commands/maintenance.py @@ -5,11 +5,13 @@ Handles setup, switch, hello, about, and other maintenance operations. """ import argparse +import hashlib import json import os import re import subprocess import shutil +import time from collections import defaultdict from datetime import datetime from pathlib import Path @@ -194,10 +196,11 @@ class MaintenanceCommand: description="Maintenance namespace for non-build workflows", add_help=True, ) - parser.add_argument("topic", nargs="?", choices=["glossary", "images", "repo-health"]) + parser.add_argument("topic", nargs="?", choices=["glossary", "images", "repo-health", "section-ids", "footnotes"]) parser.add_argument("action", nargs="?") - parser.add_argument("--vol1", action="store_true", help="Scope glossary build to vol1") - parser.add_argument("--vol2", action="store_true", help="Scope glossary build to vol2") + parser.add_argument("--vol1", action="store_true", help="Scope to vol1") + parser.add_argument("--vol2", action="store_true", help="Scope to vol2") + parser.add_argument("--path", default=None, help="File or directory path") parser.add_argument("-f", "--file", action="append", default=[], help="Image file to process (repeatable)") parser.add_argument("--all", action="store_true", help="Process all matching images") parser.add_argument("--apply", action="store_true", help="Apply changes in-place") @@ -206,6 +209,9 @@ class MaintenanceCommand: parser.add_argument("--smart-compression", action="store_true", help="Try quality first, resize only if still too large") parser.add_argument("--min-size-mb", type=int, default=1, help="Minimum size for --all image scan") parser.add_argument("--json", action="store_true", help="Emit JSON output for repo-health") + parser.add_argument("--force", action="store_true", help="Skip interactive confirmations") + parser.add_argument("--dry-run", action="store_true", help="Preview changes without modifying files") + parser.add_argument("--backup", action="store_true", 
help="Create backup files before changes") try: ns = parser.parse_args(args) @@ -245,8 +251,457 @@ class MaintenanceCommand: return False return self._maintain_repo_health(min_size_mb=ns.min_size_mb, json_output=ns.json) + if ns.topic == "section-ids": + valid_actions = ("add", "repair", "list", "remove") + if ns.action not in valid_actions: + console.print(f"[red]❌ Supported actions: {', '.join(valid_actions)}[/red]") + return False + root = self._resolve_content_path(ns.path, ns.vol1, ns.vol2) + return self._maintain_section_ids( + root=root, + action=ns.action, + force=ns.force, + dry_run=ns.dry_run, + backup=ns.backup, + ) + + if ns.topic == "footnotes": + valid_actions = ("cleanup", "reorganize", "remove") + if ns.action not in valid_actions: + console.print(f"[red]❌ Supported actions: {', '.join(valid_actions)}[/red]") + return False + root = self._resolve_content_path(ns.path, ns.vol1, ns.vol2) + return self._maintain_footnotes( + root=root, + action=ns.action, + dry_run=ns.dry_run, + backup=ns.backup, + ) + return False + def _resolve_content_path(self, path_arg, vol1: bool, vol2: bool) -> Path: + """Resolve content path from args.""" + if path_arg: + p = Path(path_arg) + return p if p.is_absolute() else (Path.cwd() / p).resolve() + base = self.config_manager.book_dir / "contents" + if vol1 and not vol2: + return base / "vol1" + if vol2 and not vol1: + return base / "vol2" + return base + + # ------------------------------------------------------------------ + # Section ID management (ported from manage_section_ids.py) + # ------------------------------------------------------------------ + + @staticmethod + def _simple_slugify(text: str) -> str: + """Convert header text to a slug, removing stopwords.""" + try: + from nltk.corpus import stopwords + stop_words = set(stopwords.words("english")) + except Exception: + stop_words = { + "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", + "for", "of", "with", "by", "from", "is", "it", "as", "be", + 
"was", "are", "were", "been", "being", "have", "has", "had", + "do", "does", "did", "will", "would", "could", "should", + "may", "might", "shall", "can", "not", "no", "so", "if", + "than", "that", "this", "these", "those", "then", "there", + "what", "which", "who", "whom", "how", "when", "where", "why", + "all", "each", "every", "both", "few", "more", "most", "other", + "some", "such", "only", "own", "same", "too", "very", + } + words = text.lower().split() + filtered = [] + for word in words: + word = re.sub(r"[^\w\s]", "", word) + if word and word not in stop_words: + filtered.append(word) + return "-".join(filtered) + + @staticmethod + def _generate_section_id(title, file_path, chapter_title, parent_sections=None, is_chapter=False): + """Generate a unique section ID.""" + clean_title = MaintenanceCommand._simple_slugify(title) + if is_chapter: + return f"sec-{clean_title}" + clean_chapter = MaintenanceCommand._simple_slugify(chapter_title) + hierarchy = "" + if parent_sections: + hierarchy = "|".join(MaintenanceCommand._simple_slugify(p) for p in parent_sections) + hash_input = f"{file_path}|{chapter_title}|{title}|{hierarchy}".encode("utf-8") + hash_suffix = hashlib.sha1(hash_input).hexdigest()[:4] + return f"sec-{clean_chapter}-{clean_title}-{hash_suffix}" + + def _maintain_section_ids(self, root: Path, action: str, force: bool, dry_run: bool, backup: bool) -> bool: + """Manage section IDs: add, repair, list, remove.""" + header_pat = re.compile(r"^(#{1,6})\s+(.+?)(?:\s*\{[^}]*\})?$") + div_start = re.compile(r"^:::\s*\{\.") + div_end = re.compile(r"^:::\s*$") + code_pat = re.compile(r"^```[^`]*$") + sec_id_pat = re.compile(r"\{#(sec-[^}]+)\}") + + files = sorted(root.rglob("*.qmd")) if root.is_dir() else ([root] if root.suffix == ".qmd" else []) + if not files: + console.print("[yellow]No .qmd files found.[/yellow]") + return False + + total_added = 0 + total_updated = 0 + total_removed = 0 + total_listed = 0 + id_replacements: dict[str, str] = {} + + for 
file in files: + lines = file.read_text(encoding="utf-8").splitlines(keepends=True) + in_code = False + in_div = False + modified = False + chapter_title = None + section_hierarchy: list[str] = [] + + # Find chapter title first + tmp_code = False + tmp_div = False + for line in lines: + s = line.strip() + if code_pat.match(s): + tmp_code = not tmp_code + continue + if tmp_code: + continue + if div_start.match(s): + tmp_div = True + continue + if div_end.match(s): + tmp_div = False + continue + if tmp_div: + continue + m = header_pat.match(line) + if m and len(m.group(1)) == 1: + chapter_title = m.group(2).strip() + break + + if not chapter_title and action in ("add", "repair"): + console.print(f"[yellow]⚠️ No chapter title in {file}, skipping[/yellow]") + continue + + if action == "list": + console.print(f"\n[cyan]📋 {file}[/cyan]") + count = 0 + for i, line in enumerate(lines, 1): + s = line.strip() + if code_pat.match(s): + in_code = not in_code + continue + if in_code: + continue + if div_start.match(s): + in_div = True + continue + if div_end.match(s): + in_div = False + continue + if in_div: + continue + m = header_pat.match(line) + if not m: + continue + attrs = "" + if "{" in line: + a_s = line.find("{") + a_e = line.rfind("}") + if a_e > a_s: + attrs = line[a_s:a_e + 1] + if ".unnumbered" in attrs: + continue + count += 1 + sid = sec_id_pat.search(line) + if sid: + console.print(f" {count:3d}. {m.group(2).strip()} → #{sid.group(1)}") + else: + console.print(f" {count:3d}. 
{m.group(2).strip()} [red](NO ID)[/red]") + total_listed += count + continue + + if backup and not dry_run: + bak = f"{file}.backup.{int(time.time())}" + shutil.copy2(file, bak) + console.print(f"[dim]💾 Backup: {bak}[/dim]") + + for i, line in enumerate(lines): + s = line.strip() + if code_pat.match(s): + in_code = not in_code + continue + if in_code: + continue + if div_start.match(s): + in_div = True + continue + if div_end.match(s): + in_div = False + continue + if in_div: + continue + + m = header_pat.match(line) + if not m: + continue + + hashes, title = m.groups() + level = len(hashes) + + while len(section_hierarchy) >= level: + section_hierarchy.pop() + section_hierarchy.append(title.strip()) + parent_sections = section_hierarchy[:-1] if len(section_hierarchy) > 1 else [] + + attrs = "" + if "{" in line: + a_s = line.find("{") + a_e = line.rfind("}") + if a_e > a_s: + attrs = line[a_s:a_e + 1] + if ".unnumbered" in attrs: + continue + + existing = sec_id_pat.search(line) + + if action == "remove": + if existing: + new_attrs = re.sub(r"#sec-[^}\s]+", "", attrs) + new_attrs = re.sub(r"\s+", " ", new_attrs).strip() + if new_attrs in ("{}", "{ }", ""): + lines[i] = f"{hashes} {title}\n" + else: + lines[i] = f"{hashes} {title} {new_attrs}\n" + modified = True + total_removed += 1 + console.print(f" 🗑️ Removed: {title.strip()}") + + elif action == "add": + if not existing: + is_ch = (level == 1) + new_id = self._generate_section_id(title, str(file), chapter_title, parent_sections, is_ch) + if attrs: + lines[i] = f"{hashes} {title} {attrs} {{#{new_id}}}\n" + else: + lines[i] = f"{hashes} {title} {{#{new_id}}}\n" + modified = True + total_added += 1 + console.print(f" ➕ Added: {title.strip()} → #{new_id}") + + elif action == "repair": + is_ch = (level == 1) + new_id = self._generate_section_id(title, str(file), chapter_title, parent_sections, is_ch) + if existing: + old_id = existing.group(1) + if old_id != new_id: + id_replacements[old_id] = new_id + new_attrs = 
re.sub(r"#sec-[^}\s]+", f"#{new_id}", attrs) + lines[i] = f"{hashes} {title} {new_attrs}\n" + modified = True + total_updated += 1 + console.print(f" 🔄 {title.strip()}: {old_id} → {new_id}") + else: + if attrs: + lines[i] = f"{hashes} {title} {attrs} {{#{new_id}}}\n" + else: + lines[i] = f"{hashes} {title} {{#{new_id}}}\n" + modified = True + total_added += 1 + console.print(f" ➕ Added: {title.strip()} → #{new_id}") + + if modified and not dry_run: + file.write_text("".join(lines), encoding="utf-8") + console.print(f"[green]✅ Saved: {file}[/green]") + + # Summary + console.print(f"\n[bold]Summary:[/bold]") + if action == "list": + console.print(f" Total sections: {total_listed}") + else: + console.print(f" Added: {total_added} Updated: {total_updated} Removed: {total_removed}") + if dry_run: + console.print("[dim] (dry-run — no files modified)[/dim]") + if id_replacements and action == "repair": + console.print(f" [yellow]{len(id_replacements)} ID replacement(s) collected[/yellow]") + console.print(" [dim]Run cross-reference update separately if needed.[/dim]") + + return True + + # ------------------------------------------------------------------ + # Footnote maintenance (ported from footnote_cleanup.py) + # ------------------------------------------------------------------ + + def _maintain_footnotes(self, root: Path, action: str, dry_run: bool, backup: bool) -> bool: + """Manage footnotes: cleanup, reorganize, remove.""" + ref_pat = re.compile(r"\[\^([^]]+)\]") + def_pat = re.compile(r"^\[\^([^]]+)\]:\s*(.+)$", re.MULTILINE) + + files = sorted(root.rglob("*.qmd")) if root.is_dir() else ([root] if root.suffix == ".qmd" else []) + if not files: + console.print("[yellow]No .qmd files found.[/yellow]") + return False + + total_modified = 0 + total_issues_fixed = 0 + + for file in files: + content = file.read_text(encoding="utf-8") + original = content + + if action == "cleanup": + # Remove undefined refs and unused defs + fn_defs = {m.group(1): m.group(2) for m in 
def_pat.finditer(content)} + fn_refs: set[str] = set() + lines = content.split("\n") + for line in lines: + for m in ref_pat.finditer(line): + fn_id = m.group(1) + dm = def_pat.match(line) + if dm and dm.group(1) == fn_id: + continue + fn_refs.add(fn_id) + + undefined = fn_refs - set(fn_defs.keys()) + unused = set(fn_defs.keys()) - fn_refs + if not undefined and not unused: + continue + + # Remove undefined refs + for ref_id in undefined: + content = re.sub(rf"\[\^{re.escape(ref_id)}\]", "", content) + total_issues_fixed += 1 + + # Remove unused defs + new_lines = [] + skip = False + for line in content.split("\n"): + dm = re.match(r"^\[\^([^]]+)\]:", line) + if dm and dm.group(1) in unused: + skip = True + total_issues_fixed += 1 + continue + if skip: + if line and (line[0] in (" ", "\t")): + continue + elif not line.strip(): + skip = False + continue + else: + skip = False + new_lines.append(line) + content = "\n".join(new_lines) + + elif action == "remove": + # Remove all footnote refs and defs + fn_defs = {m.group(1) for m in def_pat.finditer(content)} + fn_refs_set: set[str] = set() + for m in ref_pat.finditer(content): + fn_refs_set.add(m.group(1)) + + for ref_id in fn_refs_set: + content = re.sub(rf"\[\^{re.escape(ref_id)}\]", "", content) + + new_lines = [] + skip = False + for line in content.split("\n"): + if re.match(r"^\[\^[^\]]+\]:", line): + skip = True + continue + if skip: + if line and (line[0] in (" ", "\t")): + continue + elif not line.strip(): + skip = False + continue + else: + skip = False + new_lines.append(line) + content = "\n".join(new_lines) + + elif action == "reorganize": + # Move definitions to after their first reference paragraph + fn_defs_map = {} + for m in def_pat.finditer(content): + fn_defs_map[m.group(1)] = m.group(2) + fn_refs_map: dict[str, list[int]] = defaultdict(list) + lines = content.split("\n") + for line_num, line in enumerate(lines): + for m in ref_pat.finditer(line): + fn_id = m.group(1) + dm = def_pat.match(line) + 
if dm and dm.group(1) == fn_id: + continue + fn_refs_map[fn_id].append(line_num) + + if not fn_defs_map: + continue + + # Remove existing defs + skip_lines: set[int] = set() + for i, line in enumerate(lines): + if def_pat.match(line): + skip_lines.add(i) + + new_lines = [] + processed: set[str] = set() + for i, line in enumerate(lines): + if i in skip_lines: + continue + new_lines.append(line) + + # Check for refs in this line + line_refs = [] + for m in ref_pat.finditer(line): + fn_id = m.group(1) + if fn_id in fn_defs_map and fn_id not in processed: + line_refs.append(fn_id) + + if line_refs: + # Find paragraph end + para_end = i + for j in range(i + 1, len(lines)): + if j in skip_lines: + continue + next_line = lines[j].strip() + if not next_line or next_line.startswith("#") or next_line.startswith(":::") or next_line.startswith("```") or next_line.startswith("|") or def_pat.match(lines[j]): + break + para_end = j + + if i == para_end: + new_lines.append("") + for fn_id in line_refs: + if fn_id in fn_defs_map: + new_lines.append(f"[^{fn_id}]: {fn_defs_map[fn_id]}") + processed.add(fn_id) + + content = "\n".join(new_lines) + + if content != original: + total_modified += 1 + if backup and not dry_run: + bak = file.with_suffix(file.suffix + ".bak") + shutil.copy2(file, bak) + if not dry_run: + file.write_text(content, encoding="utf-8") + console.print(f"[green]✅ {action}: {file}[/green]") + else: + console.print(f"[dim]⏭️ No changes: {file}[/dim]") + + console.print(f"\n[bold]Summary:[/bold] {total_modified} file(s) modified") + if action == "cleanup": + console.print(f" Issues fixed: {total_issues_fixed}") + if dry_run: + console.print("[dim] (dry-run — no files modified)[/dim]") + return True + def _maintain_glossary_build(self, volume: str = None) -> bool: """Build deduplicated volume glossary JSON files from chapter glossaries.""" book_dir = self.config_manager.book_dir diff --git a/book/cli/commands/validate.py b/book/cli/commands/validate.py index 
3995f8ef2..ab64b71b2 100644 --- a/book/cli/commands/validate.py +++ b/book/cli/commands/validate.py @@ -9,9 +9,11 @@ from __future__ import annotations import argparse import json +import os import re import time -from dataclasses import dataclass +from collections import defaultdict +from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional, Set, Tuple @@ -112,6 +114,16 @@ class ValidateCommand: "duplicate-labels", "unreferenced-labels", "inline-refs", + "section-ids", + "forbidden-footnotes", + "footnotes", + "figure-completeness", + "figure-placement", + "index-placement", + "render-patterns", + "dropcap", + "part-keys", + "image-refs", "all", ], help="Validation command to run", @@ -155,6 +167,16 @@ class ValidateCommand: runs.append(self._run_duplicate_labels(root_path, label_types)) runs.append(self._run_unreferenced_labels(root_path, label_types)) runs.append(self._run_inline_refs(root_path, check_patterns=ns.check_patterns)) + runs.append(self._run_section_ids(root_path)) + runs.append(self._run_forbidden_footnotes(root_path)) + runs.append(self._run_footnotes(root_path)) + runs.append(self._run_figure_completeness(root_path)) + runs.append(self._run_figure_placement(root_path)) + runs.append(self._run_index_placement(root_path)) + runs.append(self._run_render_patterns(root_path)) + runs.append(self._run_dropcap(root_path)) + runs.append(self._run_part_keys(root_path)) + runs.append(self._run_image_refs(root_path)) elif ns.subcommand == "inline-python": runs.append(self._run_inline_python(root_path)) elif ns.subcommand == "refs": @@ -169,6 +191,26 @@ class ValidateCommand: runs.append(self._run_unreferenced_labels(root_path, self._selected_label_types(ns))) elif ns.subcommand == "inline-refs": runs.append(self._run_inline_refs(root_path, check_patterns=ns.check_patterns)) + elif ns.subcommand == "section-ids": + runs.append(self._run_section_ids(root_path)) + elif ns.subcommand == "forbidden-footnotes": + 
runs.append(self._run_forbidden_footnotes(root_path)) + elif ns.subcommand == "footnotes": + runs.append(self._run_footnotes(root_path)) + elif ns.subcommand == "figure-completeness": + runs.append(self._run_figure_completeness(root_path)) + elif ns.subcommand == "figure-placement": + runs.append(self._run_figure_placement(root_path)) + elif ns.subcommand == "index-placement": + runs.append(self._run_index_placement(root_path)) + elif ns.subcommand == "render-patterns": + runs.append(self._run_render_patterns(root_path)) + elif ns.subcommand == "dropcap": + runs.append(self._run_dropcap(root_path)) + elif ns.subcommand == "part-keys": + runs.append(self._run_part_keys(root_path)) + elif ns.subcommand == "image-refs": + runs.append(self._run_image_refs(root_path)) any_failed = any(not run.passed for run in runs) summary = { @@ -639,6 +681,945 @@ class ValidateCommand: elapsed_ms=int((time.time() - start) * 1000), ) + # ------------------------------------------------------------------ + # Section IDs (ported from manage_section_ids.py --verify) + # ------------------------------------------------------------------ + + def _run_section_ids(self, root: Path) -> ValidationRunResult: + start = time.time() + files = self._qmd_files(root) + issues: List[ValidationIssue] = [] + + header_pat = re.compile(r"^(#{1,6})\s+(.+?)(?:\s*\{[^}]*\})?$") + div_start_pat = re.compile(r"^:::\s*\{\.") + div_end_pat = re.compile(r"^:::\s*$") + code_block_pat = re.compile(r"^```[^`]*$") + sec_id_pat = re.compile(r"\{#sec-[^}]+\}") + + for file in files: + lines = self._read_text(file).splitlines() + in_code = False + in_div = False + + for idx, line in enumerate(lines, 1): + stripped = line.strip() + if code_block_pat.match(stripped): + in_code = not in_code + continue + if in_code: + continue + if div_start_pat.match(stripped): + in_div = True + continue + if div_end_pat.match(stripped): + in_div = False + continue + if in_div: + continue + + match = header_pat.match(line) + if not match: 
+ continue + + # Extract existing attributes + existing_attrs = "" + if "{" in line: + attrs_start = line.find("{") + attrs_end = line.rfind("}") + if attrs_end > attrs_start: + existing_attrs = line[attrs_start : attrs_end + 1] + + if ".unnumbered" in existing_attrs: + continue + + if not sec_id_pat.search(line): + title = match.group(2).strip() + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=idx, + code="missing_section_id", + message=f"Header missing section ID: {title}", + severity="error", + context=line.strip()[:160], + ) + ) + + return ValidationRunResult( + name="section-ids", + description="Verify all section headers have {#sec-...} IDs", + files_checked=len(files), + issues=issues, + elapsed_ms=int((time.time() - start) * 1000), + ) + + # ------------------------------------------------------------------ + # Forbidden Footnotes (ported from check_forbidden_footnotes.py) + # ------------------------------------------------------------------ + + def _run_forbidden_footnotes(self, root: Path) -> ValidationRunResult: + start = time.time() + files = self._qmd_files(root) + issues: List[ValidationIssue] = [] + + fn_pat = re.compile(r"\[\^fn-[\w-]+\]") + inline_fn_pat = re.compile(r"\^\[[^\]]+\]") + table_sep_pat = re.compile(r"^\|[\s\-:+]+\|") + + for file in files: + lines = self._read_text(file).splitlines() + div_depth = 0 + div_start_line = 0 + + for idx, line in enumerate(lines, 1): + stripped = line.strip() + + # Track div nesting + if re.match(r"^:{3,4}\s*\{", stripped) or re.match(r"^:{3,4}\s+\w", stripped): + div_depth += 1 + if div_depth == 1: + div_start_line = idx + elif re.match(r"^:{3,4}\s*$", stripped): + if div_depth > 0: + div_depth -= 1 + if div_depth == 0: + div_start_line = 0 + + # Check inline footnotes (always forbidden) + for m in inline_fn_pat.finditer(line): + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=idx, + code="inline_footnote", + message=f"Inline footnote syntax; use 
[^fn-name] reference format", + severity="error", + context=m.group(0)[:80], + ) + ) + + footnotes = fn_pat.findall(line) + if not footnotes: + continue + + # Table cell check + if stripped.startswith("|") and stripped.count("|") >= 2 and not table_sep_pat.match(stripped): + for fn in footnotes: + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=idx, + code="footnote_in_table", + message=f"Footnote {fn} in table cell", + severity="error", + context=stripped[:80], + ) + ) + + # YAML caption check + if re.match(r"^\s*(fig-cap|tbl-cap):", line): + cap_type = "figure" if "fig-cap" in line else "table" + for fn in footnotes: + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=idx, + code=f"footnote_in_{cap_type}_caption", + message=f"Footnote {fn} in {cap_type} caption", + severity="error", + context=stripped[:80], + ) + ) + + # Markdown caption check + if re.match(r"^:\s*\*\*[^*]+\*\*:", line): + for fn in footnotes: + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=idx, + code="footnote_in_markdown_caption", + message=f"Footnote {fn} in markdown caption", + severity="error", + context=stripped[:80], + ) + ) + + # Callout title check + if re.match(r"^:{3,4}\s*\{.*title=", stripped): + title_match = re.search(r'title="([^"]*)"', line) + if title_match and fn_pat.search(title_match.group(1)): + for fn in fn_pat.findall(title_match.group(1)): + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=idx, + code="footnote_in_callout_title", + message=f"Footnote {fn} in callout title (breaks LaTeX)", + severity="error", + context=stripped[:80], + ) + ) + + # Div block check + if div_depth > 0 and div_start_line != idx: + for fn in footnotes: + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=idx, + code="footnote_in_div", + message=f"Footnote {fn} inside div block (started line {div_start_line})", + severity="error", + context=stripped[:80], + ) + ) + + 
return ValidationRunResult( + name="forbidden-footnotes", + description="Check footnotes in forbidden locations", + files_checked=len(files), + issues=issues, + elapsed_ms=int((time.time() - start) * 1000), + ) + + # ------------------------------------------------------------------ + # Footnote validation (ported from footnote_cleanup.py --validate) + # ------------------------------------------------------------------ + + def _run_footnotes(self, root: Path) -> ValidationRunResult: + start = time.time() + files = self._qmd_files(root) + issues: List[ValidationIssue] = [] + + ref_pat = re.compile(r"\[\^([^]]+)\]") + def_pat = re.compile(r"^\[\^([^]]+)\]:\s*(.+)$", re.MULTILINE) + + for file in files: + content = self._read_text(file) + lines = content.split("\n") + + # Collect definitions + fn_defs: Dict[str, str] = {} + for m in def_pat.finditer(content): + fn_defs[m.group(1)] = m.group(2) + + # Collect references (excluding definition lines themselves) + fn_refs: Dict[str, List[int]] = defaultdict(list) + for line_num, line in enumerate(lines, 1): + for m in ref_pat.finditer(line): + fn_id = m.group(1) + dm = def_pat.match(line) + if dm and dm.group(1) == fn_id: + continue # definition line, not a reference + fn_refs[fn_id].append(line_num) + + # Undefined references + for fn_id in sorted(set(fn_refs.keys()) - set(fn_defs.keys())): + first_line = fn_refs[fn_id][0] + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=first_line, + code="undefined_footnote_ref", + message=f"Undefined footnote reference: [^{fn_id}]", + severity="error", + context=f"[^{fn_id}]", + ) + ) + + # Unused definitions + for fn_id in sorted(set(fn_defs.keys()) - set(fn_refs.keys())): + def_line = self._line_for_token(content, f"[^{fn_id}]:") + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=def_line, + code="unused_footnote_def", + message=f"Unused footnote definition: [^{fn_id}]", + severity="warning", + context=f"[^{fn_id}]:", + ) + ) + + 
# Duplicate definitions + def_counts: Dict[str, int] = defaultdict(int) + for line in lines: + dm = re.match(r"^\[\^([^]]+)\]:", line) + if dm: + def_counts[dm.group(1)] += 1 + for fn_id, count in def_counts.items(): + if count > 1: + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=self._line_for_token(content, f"[^{fn_id}]:"), + code="duplicate_footnote_def", + message=f"Duplicate footnote definition ({count}x): [^{fn_id}]", + severity="error", + context=f"[^{fn_id}]:", + ) + ) + + return ValidationRunResult( + name="footnotes", + description="Validate footnote references and definitions", + files_checked=len(files), + issues=issues, + elapsed_ms=int((time.time() - start) * 1000), + ) + + # ------------------------------------------------------------------ + # Figure completeness (ported from check_figure_completeness.py) + # ------------------------------------------------------------------ + + def _run_figure_completeness(self, root: Path) -> ValidationRunResult: + start = time.time() + files = self._qmd_files(root) + issues: List[ValidationIssue] = [] + + fig_id_pat = re.compile(r"\{#(fig-[a-zA-Z0-9_-]+)[\s}]") + md_cap_pat = re.compile(r"!\[(.+?)\]\(") + + for file in files: + lines = self._read_text(file).splitlines() + seen_ids: Set[str] = set() + + # Pass 1: attribute-based figures + for idx, line in enumerate(lines, 1): + m = fig_id_pat.search(line) + if not m: + continue + fig_id = m.group(1) + has_cap = bool(re.search(r'fig-cap="[^"]+', line)) + has_alt = bool(re.search(r'fig-alt="[^"]+', line)) + + if "![" in line: + md_m = md_cap_pat.search(line) + if md_m and md_m.group(1).strip(): + has_cap = True + + seen_ids.add(fig_id) + missing = [] + if not has_cap: + missing.append("caption") + if not has_alt: + missing.append("alt-text") + if missing: + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=idx, + code="incomplete_figure", + message=f"Figure {fig_id} missing: {', '.join(missing)}", + 
severity="error", + context=line.strip()[:120], + ) + ) + + # Pass 2: code-cell figures + in_code = False + code_start = 0 + cell_opts: Dict[str, str] = {} + for idx, line in enumerate(lines, 1): + stripped = line.rstrip() + if not in_code and re.match(r"^```\{(?:python|r|julia|ojs)", stripped): + in_code = True + code_start = idx + cell_opts = {} + continue + if in_code and stripped == "```": + label = cell_opts.get("label", "") + if label.startswith("fig-") and label not in seen_ids: + cap_val = cell_opts.get("fig-cap", "") + alt_val = cell_opts.get("fig-alt", "") + missing = [] + if not cap_val: + missing.append("caption") + if not alt_val: + missing.append("alt-text") + if missing: + issues.append( + ValidationIssue( + file=self._relative_file(file), + line=code_start, + code="incomplete_figure", + message=f"Figure {label} missing: {', '.join(missing)}", + severity="error", + context=f"code-cell figure {label}", + ) + ) + seen_ids.add(label) + in_code = False + cell_opts = {} + continue + if in_code: + opt_m = re.match(r"^#\|\s*([\w-]+):\s*(.+)$", stripped) + if opt_m: + val = opt_m.group(2).strip().strip("\"'") + cell_opts[opt_m.group(1)] = val + + return ValidationRunResult( + name="figure-completeness", + description="Check figures have captions and alt-text", + files_checked=len(files), + issues=issues, + elapsed_ms=int((time.time() - start) * 1000), + ) + + # ------------------------------------------------------------------ + # Figure/table placement (ported from figure_table_flow_audit.py) + # ------------------------------------------------------------------ + + def _run_figure_placement(self, root: Path) -> ValidationRunResult: + start = time.time() + files = self._qmd_files(root) + issues: List[ValidationIssue] = [] + + div_def_pat = re.compile(r":::\s*\{[^}]*#((?:fig|tbl)-[\w-]+)") + img_def_pat = re.compile(r"!\[.*?\]\(.*?\)\s*\{[^}]*#((?:fig|tbl)-[\w-]+)") + tbl_cap_pat = re.compile(r"^:\s+.*\{[^}]*#((?:fig|tbl)-[\w-]+)") + ref_pat = 
re.compile(r"@((?:fig|tbl)-[\w-]+)")

        for file in files:
            # Per-file state: first definition line per label, all
            # reference lines per label, and the spans of code blocks
            # (used later to discount non-prose lines from gaps).
            lines = self._read_text(file).splitlines()
            defs: Dict[str, int] = {}
            refs: Dict[str, List[int]] = defaultdict(list)
            in_code = False
            in_float = False
            float_label: Optional[str] = None
            code_spans: List[Tuple[int, int]] = []
            code_start = 0
            cell_opts: Dict[str, str] = {}

            for idx, line in enumerate(lines, 1):
                stripped = line.rstrip()

                # Code block tracking
                if not in_code and re.match(r"^```\{", stripped):
                    in_code = True
                    code_start = idx
                    cell_opts = {}
                    continue
                if in_code and stripped == "```":
                    code_spans.append((code_start, idx))
                    # A code cell labelled fig-/tbl- counts as a definition
                    # located at the cell's opening fence.
                    label = cell_opts.get("label", "")
                    if label.startswith(("fig-", "tbl-")) and label not in defs:
                        defs[label] = code_start
                    in_code = False
                    cell_opts = {}
                    continue
                if in_code:
                    opt_m = re.match(r"^#\|\s*([\w-]+):\s*(.+)$", stripped)
                    if opt_m:
                        cell_opts[opt_m.group(1)] = opt_m.group(2).strip().strip("\"'")
                    continue

                # Attribute-based definitions
                for pat in [div_def_pat, img_def_pat, tbl_cap_pat]:
                    m = pat.search(line)
                    if m:
                        label = m.group(1)
                        # Only the first definition of a label is recorded.
                        if label not in defs:
                            defs[label] = idx
                        # Only ::: div definitions open a multi-line float
                        # block whose interior self-references are ignored.
                        if pat == div_def_pat:
                            in_float = True
                            float_label = label

                # Track float block end
                # A bare ::: fence (without an opening "{...}" attribute
                # list) closes the currently open float div.
                if in_float:
                    ls = line.strip()
                    if ls.startswith(":::") and not ls.startswith("::: {"):
                        in_float = False
                        float_label = None

                # References
                # Skip caption/alt attribute lines entirely so label text
                # inside fig-cap/fig-alt is not counted as a reference.
                if "fig-cap=" in line or "fig-alt=" in line:
                    continue
                for m in ref_pat.finditer(line):
                    label = m.group(1)
                    # A float's reference to itself inside its own div
                    # does not count.
                    if in_float and label == float_label:
                        continue
                    refs[label].append(idx)

            # Evaluate status
            all_labels = set(defs.keys()) | set(refs.keys())
            for label in sorted(all_labels):
                def_line = defs.get(label)
                ref_lines = refs.get(label, [])
                first_ref = min(ref_lines) if ref_lines else None

                if not def_line:
                    continue  # XREF — informational, skip
                if not first_ref:
                    # Defined but never referenced anywhere in the file.
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=def_line,
                            code="orphan_float",
                            message=f"{'Figure' if label.startswith('fig-') else 'Table'} {label} defined but never referenced",
                            severity="warning",
                            context=label,
                        )
                    )
                    continue

                # Compute prose gap: raw line distance from first reference
                # to definition, minus any lines covered by code blocks.
                gap = def_line - first_ref
                code_lines = 0
                if gap > 0:
                    for cs, ce in code_spans:
                        os_ = max(first_ref, cs)
                        oe_ = min(def_line, ce)
                        if os_ <= oe_:
                            code_lines += oe_ - os_ + 1
                prose_gap = gap - code_lines

                if prose_gap > 30:
                    # Check closest reference: if any other reference sits
                    # within [-5, +30] lines of the definition, placement
                    # is acceptable despite the distant first mention.
                    closest = min(ref_lines, key=lambda r: abs(def_line - r))
                    closest_gap = def_line - closest
                    if -5 <= closest_gap <= 30:
                        continue  # OK
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=def_line,
                            code="late_float",
                            message=f"{label} defined at L{def_line}, first referenced at L{first_ref} (too far after mention)",
                            severity="warning",
                            context=label,
                        )
                    )
                elif prose_gap < -5:
                    # Definition appears more than 5 lines before its first
                    # mention in the prose.
                    issues.append(
                        ValidationIssue(
                            file=self._relative_file(file),
                            line=def_line,
                            code="early_float",
                            message=f"{label} defined at L{def_line}, first referenced at L{first_ref} (appears before mention)",
                            severity="warning",
                            context=label,
                        )
                    )

        return ValidationRunResult(
            name="figure-placement",
            description="Audit figure/table placement relative to first reference",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )

    # ------------------------------------------------------------------
    # Index placement (ported from check_index_placement.py)
    # ------------------------------------------------------------------

    def _run_index_placement(self, root: Path) -> ValidationRunResult:
        """Check placement of LaTeX ``\\index{}`` commands in .qmd files.

        Flags \\index{} occurring on a heading line, directly before a
        ::: div/callout fence, on the same line as a div/callout opener,
        or in front of a footnote definition. Lines inside fenced code
        blocks are skipped.
        """
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []

        # Each check is (issue code, compiled line pattern, message).
        checks = [
            ("index_on_heading", re.compile(r"^#{1,6}\s+.*\\index\{"), "\\index{} on same line as heading"),
            ("index_before_div", re.compile(r"\\index\{[^}]*\}:::"), "\\index{} directly before ::: (div/callout)"),
            ("index_after_div", 
re.compile(r"^::+\s+\{[^}]*\}\s*\\index\{"), "\\index{} on same line as div/callout"),
            ("index_before_footnote", re.compile(r"^\\index\{[^}]*\}.*\[\^[^\]]+\]:"), "\\index{} before footnote definition"),
        ]

        for file in files:
            lines = self._read_text(file).splitlines()
            in_code = False
            for idx, line in enumerate(lines, 1):
                # Any ``` line toggles code-block state (opening and
                # closing fences both start with ```); checks below are
                # skipped while inside a fence.
                if line.strip().startswith("```"):
                    in_code = not in_code
                    continue
                if in_code:
                    continue

                for code, pattern, message in checks:
                    # Skip fig-cap lines for index_after_div
                    if code == "index_after_div" and "fig-cap=" in line:
                        continue
                    if pattern.search(line):
                        issues.append(
                            ValidationIssue(
                                file=self._relative_file(file),
                                line=idx,
                                code=code,
                                message=message,
                                severity="error",
                                context=line.strip()[:120],
                            )
                        )

        return ValidationRunResult(
            name="index-placement",
            description="Check LaTeX \\index{} placement",
            files_checked=len(files),
            issues=issues,
            elapsed_ms=int((time.time() - start) * 1000),
        )

    # ------------------------------------------------------------------
    # Render patterns (ported from check_render_patterns.py)
    # ------------------------------------------------------------------

    def _run_render_patterns(self, root: Path) -> ValidationRunResult:
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []

        regex_checks = [
            # NOTE(review): the source is corrupted from this point — the
            # regex below is truncated (likely a lookbehind such as r"(?<!`)"
            # whose "<...>" span was eaten by markup stripping), and the text
            # up to the following "ValidationRunResult:" fragment — the rest
            # of this method, its return, and the signature of the next
            # method — is missing from this view. Restore both methods from
            # check_render_patterns.py / the original script before relying
            # on this code.
            ("missing_opening_backtick", re.compile(r"(? ValidationRunResult:
        start = time.time()
        files = self._qmd_files(root)
        issues: List[ValidationIssue] = []

        # Structural patterns for chapter/heading layout checks.
        chapter_hdr = re.compile(r"^#\s+[^#].*\{#sec-")
        numbered_h2 = re.compile(r"^##\s+[^#]")
        unnumbered_h2 = re.compile(r"^##\s+.*\{.*\.unnumbered.*\}")
        starts_xref = re.compile(r"^\s*@(sec|fig|tbl|lst|eq)-")
        starts_link = re.compile(r"^\s*\[")
        starts_inline = re.compile(r"^\s*`")
        yaml_fence = re.compile(r"^---\s*$")
        code_fence = re.compile(r"^```")
        div_fence = re.compile(r"^:::")
        blank = re.compile(r"^\s*$")
        html_comment = re.compile(r"^\s*