mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-04-29 17:20:21 -05:00
Remove redundant ml_ prefix from ml_workflow chapter files and update all Quarto config references. Consolidate custom scripts into native binder subcommands and archive obsolete tooling.
1125 lines
43 KiB
Python
1125 lines
43 KiB
Python
"""
|
|
``binder info`` — Book statistics and figure extraction.
|
|
|
|
Subcommands:
|
|
stats — Count figures, tables, equations, listings, text lines, words
|
|
figures — Extract figure list with labels, captions, and alt-text
|
|
Use --with-pdf to merge LaTeX figure numbers and page numbers
|
|
from a previous PDF build.
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
from rich.console import Console
|
|
from rich.panel import Panel
|
|
from rich.table import Table
|
|
|
|
console = Console()
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Patterns
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_FIGURE_DEF = [
|
|
re.compile(r"\{#(fig-[\w-]+)"),
|
|
re.compile(r"#\|\s*label:\s*(fig-[\w-]+)"),
|
|
]
|
|
_TABLE_DEF = [re.compile(r"\{#(tbl-[\w-]+)")]
|
|
_EQUATION_DEF = [re.compile(r"\{#(eq-[\w-]+)")]
|
|
_LISTING_DEF = [re.compile(r"\{#(lst-[\w-]+)")]
|
|
_SECTION_DEF = [re.compile(r"\{#(sec-[\w-]+)")]
|
|
|
|
_CODE_FENCE = re.compile(r"^```")
|
|
_YAML_FENCE = re.compile(r"^---\s*$")
|
|
|
|
# Figure extraction patterns — greedy .* anchored to end-of-line so interior
|
|
# braces from LaTeX (e.g. $W_{hh}$, \mathcal{F}) don't truncate the match.
|
|
_DIV_FIG = re.compile(
|
|
r"^:{3,}\s*\{(.*#fig-.*)\}\s*$", re.MULTILINE
|
|
)
|
|
_IMG_FIG = re.compile(
|
|
r"!\[((?:[^\[\]]|\[[^\]]*\])*)\]"
|
|
r"\([^)]+\)"
|
|
r"\{(.*#fig-[^\n]*)\}\s*$",
|
|
re.MULTILINE,
|
|
)
|
|
_ATTR_CAP = re.compile(r'fig-cap\s*=\s*"([^"]*)"')
|
|
_ATTR_ALT = re.compile(r'fig-alt\s*=\s*"([^"]*)"')
|
|
|
|
# Code-block figure pattern: ```{python} with #| cell options
|
|
_CODE_BLOCK_FIG = re.compile(
|
|
r"```\{(?:python|r|julia|ojs)\}[^\n]*\n"
|
|
r"((?:#\|[^\n]*\n)+)",
|
|
re.MULTILINE,
|
|
)
|
|
|
|
# LaTeX manifest
|
|
_MANIFEST_HEADER = "LATEX FIGURE MANIFEST"
|
|
_LATEX_FIG_PAT = re.compile(r"Figure\s+([A-Z\d]+\.\d+)\s*\|\s*Page\s*(\d+)")
|
|
|
|
|
|
class InfoCommand:
|
|
"""Native ``binder info`` command group."""
|
|
|
|
def __init__(self, config_manager, chapter_discovery):
|
|
self.config_manager = config_manager
|
|
self.chapter_discovery = chapter_discovery
|
|
|
|
# ------------------------------------------------------------------
|
|
# Entry point
|
|
# ------------------------------------------------------------------
|
|
|
|
def run(self, args: List[str]) -> bool:
    """Parse ``binder info`` arguments and dispatch to the chosen subcommand.

    Returns True on success.  When argparse exits (bad arguments or a help
    request) the result is True only if help was explicitly requested.
    """
    cli = argparse.ArgumentParser(
        prog="binder info",
        description="Book statistics and figure extraction",
    )
    cli.add_argument(
        "subcommand",
        nargs="?",
        choices=["stats", "figures", "concepts", "headers", "acronyms"],
        help="Subcommand to run",
    )
    cli.add_argument("--path", default=None, help="File or directory")
    cli.add_argument("--vol1", action="store_true", help="Volume I only")
    cli.add_argument("--vol2", action="store_true", help="Volume II only")
    cli.add_argument(
        "--format",
        choices=["text", "markdown", "csv"],
        default="text",
        help="Output format for figures (default: text)",
    )
    cli.add_argument("--output", default=None, help="Write to file")
    cli.add_argument("--json", action="store_true", help="JSON output (stats)")
    cli.add_argument(
        "--by-chapter",
        action="store_true",
        help="Break down stats per chapter",
    )
    cli.add_argument(
        "--with-pdf",
        action="store_true",
        help="Merge LaTeX figure numbers/pages from a previous PDF build",
    )
    cli.add_argument(
        "--manifest",
        default=None,
        help="Path to LaTeX figure manifest (*_figures.txt). Auto-detected if omitted.",
    )

    try:
        opts = cli.parse_args(args)
    except SystemExit:
        # argparse exits both for --help and for invalid input; only the
        # former counts as a successful run.
        return any(flag in args for flag in ("-h", "--help"))

    if not opts.subcommand:
        self._print_help()
        return True

    target = self._resolve_path(opts.path, opts.vol1, opts.vol2)
    if not target.exists():
        console.print(f"[red]Path not found: {target}[/red]")
        return False

    # Map each subcommand to its handler method name; the attribute is looked
    # up only for the subcommand that was actually requested.
    handler_names = {
        "stats": "_run_stats",
        "figures": "_run_figures",
        "concepts": "_run_concepts",
        "headers": "_run_headers",
        "acronyms": "_run_acronyms",
    }
    handler_name = handler_names.get(opts.subcommand)
    if handler_name is None:
        return False
    return getattr(self, handler_name)(target, opts)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Help
|
|
# ------------------------------------------------------------------
|
|
|
|
def _print_help(self) -> None:
    """Print the subcommand overview panel followed by usage examples."""
    subcommands = (
        ("stats", "Count figures, tables, equations, listings, words, text lines"),
        ("figures", "Extract figure list with labels, captions, and alt-text"),
        ("concepts", "Extract key concepts (bold terms, definitions, headers)"),
        ("headers", "List all section headers with levels"),
        ("acronyms", "Find acronyms in parentheses (e.g., (CNN))"),
    )
    examples = (
        " [cyan]./binder info stats --vol1[/cyan]",
        " [cyan]./binder info stats --by-chapter[/cyan]",
        " [cyan]./binder info figures --vol1[/cyan]",
        " [cyan]./binder info figures --vol1 --with-pdf[/cyan] [dim]# merge LaTeX fig numbers + pages[/dim]",
        " [cyan]./binder info concepts --vol1[/cyan]",
        " [cyan]./binder info headers --vol1[/cyan]",
        " [cyan]./binder info acronyms --vol1[/cyan]",
    )

    overview = Table(show_header=True, header_style="bold cyan", box=None)
    overview.add_column("Subcommand", style="cyan", width=14)
    overview.add_column("Description", style="white", width=50)
    for name, description in subcommands:
        overview.add_row(name, description)

    console.print(Panel(overview, title="binder info <subcommand>", border_style="cyan"))
    console.print("[dim]Examples:[/dim]")
    for example in examples:
        console.print(example)
    console.print()
|
|
|
|
# ------------------------------------------------------------------
|
|
# Path resolution
|
|
# ------------------------------------------------------------------
|
|
|
|
def _resolve_path(self, path_arg: Optional[str], vol1: bool, vol2: bool) -> Path:
|
|
if path_arg:
|
|
p = Path(path_arg)
|
|
return p if p.is_absolute() else Path.cwd() / p
|
|
base = self.config_manager.book_dir
|
|
if vol1:
|
|
return base / "contents" / "vol1"
|
|
if vol2:
|
|
return base / "contents" / "vol2"
|
|
return base / "contents"
|
|
|
|
def _qmd_files(self, root: Path) -> List[Path]:
|
|
if root.is_file():
|
|
return [root] if root.suffix == ".qmd" else []
|
|
return sorted(root.rglob("*.qmd"))
|
|
|
|
def _chapter_name(self, path: Path) -> str:
|
|
"""Extract a human-readable chapter name from a QMD file."""
|
|
try:
|
|
content = path.read_text(encoding="utf-8")
|
|
except Exception:
|
|
return path.stem
|
|
# Skip YAML front matter
|
|
if content.startswith("---"):
|
|
end = content.find("---", 3)
|
|
if end != -1:
|
|
content = content[end + 3:]
|
|
m = re.search(r"^#\s+([^{\n]+)", content, re.MULTILINE)
|
|
if m:
|
|
return m.group(1).strip()
|
|
return path.stem.replace("_", " ").title()
|
|
|
|
def _relative(self, path: Path) -> str:
|
|
try:
|
|
return str(path.relative_to(self.config_manager.book_dir))
|
|
except ValueError:
|
|
return str(path)
|
|
|
|
# ------------------------------------------------------------------
|
|
# Stats
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_stats(self, root: Path, ns: argparse.Namespace) -> bool:
    """Count book elements under *root* and print them as JSON or rich tables.

    Files under ``parts/`` and ``frontmatter/`` are excluded.  Returns False
    (after printing a notice) when no chapter files remain, True otherwise.
    """
    # Filter out parts/, frontmatter/ for cleaner stats.  as_posix() keeps
    # the check working when the platform uses backslash separators.
    excluded = ("/parts/", "/frontmatter/")
    chapter_files = [
        f for f in self._qmd_files(root)
        if not any(marker in f.as_posix() for marker in excluded)
    ]

    # Guard the post-filter list, not just the raw scan: with nothing to
    # aggregate the totals dict is empty and the table printers would raise
    # KeyError on their first lookup.
    if not chapter_files:
        console.print("[yellow]No QMD files found.[/yellow]")
        return False

    all_stats: List[Dict] = []
    for path in chapter_files:
        stats = self._count_file(path)
        stats["file"] = self._relative(path)
        stats["chapter"] = self._chapter_name(path)
        all_stats.append(stats)

    totals = self._aggregate(all_stats)

    if ns.json:
        import json

        payload = {
            "path": str(root),
            "files": len(all_stats),
            "totals": totals,
        }
        if ns.by_chapter:
            payload["chapters"] = all_stats
        print(json.dumps(payload, indent=2))
        return True

    # Rich table output
    if ns.by_chapter:
        self._print_chapter_stats(all_stats, totals)
    else:
        self._print_summary_stats(all_stats, totals, root)
    return True
|
|
|
|
def _count_file(self, path: Path) -> Dict:
    """Count labelled elements, citations, text lines and words in one QMD file.

    The file is scanned line by line.  YAML front matter is skipped, fenced
    code blocks are counted and searched only for figure labels, and
    div/comment lines contribute labels but no prose.  An unreadable file
    yields all-zero counts.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except Exception:
        return self._empty_stats()

    def hits(patterns, candidate: str) -> int:
        """Total number of matches of every pattern in *candidate*."""
        return sum(len(p.findall(candidate)) for p in patterns)

    cite_re = re.compile(r"@[\w-]+")
    footnote_re = re.compile(r"^\[\^[^\]]+\]:")
    xref_prefixes = ("fig-", "tbl-", "sec-", "eq-", "lst-")

    tally = dict.fromkeys(
        (
            "figures", "tables", "equations", "listings", "sections",
            "footnotes", "citations", "code_blocks", "text_lines", "words",
        ),
        0,
    )
    inside_code = False
    inside_yaml = False

    for lineno, raw_line in enumerate(text.splitlines()):
        body = raw_line.strip()

        # YAML front matter: opens only on the very first line.
        if lineno == 0 and _YAML_FENCE.match(body):
            inside_yaml = True
            continue
        if inside_yaml:
            inside_yaml = _YAML_FENCE.match(body) is None
            continue

        # Code fences toggle code mode; each opening fence is one block.
        if _CODE_FENCE.match(body):
            inside_code = not inside_code
            if inside_code:
                tally["code_blocks"] += 1
            continue
        if inside_code:
            # Only figure labels are counted inside code blocks.
            tally["figures"] += hits(_FIGURE_DEF, raw_line)
            continue

        # Blank lines carry nothing.
        if not body:
            continue

        # Div fences and HTML comments are not prose but may carry labels.
        if body.startswith((":::", "<!--")):
            tally["figures"] += hits(_FIGURE_DEF, raw_line)
            tally["tables"] += hits(_TABLE_DEF, raw_line)
            tally["listings"] += hits(_LISTING_DEF, raw_line)
            continue

        # Label definitions on ordinary lines.
        tally["figures"] += hits(_FIGURE_DEF, raw_line)
        tally["tables"] += hits(_TABLE_DEF, raw_line)
        tally["equations"] += hits(_EQUATION_DEF, raw_line)
        tally["listings"] += hits(_LISTING_DEF, raw_line)
        tally["sections"] += hits(_SECTION_DEF, raw_line)

        if footnote_re.match(body):
            tally["footnotes"] += 1

        # Rough citation count: every @key that is not a cross-reference.
        for cite in cite_re.finditer(raw_line):
            if not cite.group(0)[1:].startswith(xref_prefixes):
                tally["citations"] += 1

        # Cell-option lines and pipe-table separator rows are not prose.
        if body.startswith(("#|", "%%|")):
            continue
        if body.startswith("|") and set(body.replace("|", "").strip()) <= {"-", ":", " "}:
            continue

        tally["text_lines"] += 1

        # Approximate word count: inline python values collapse to one
        # token, attribute blocks vanish, markdown punctuation becomes
        # whitespace, and tokens starting with a backslash are ignored.
        plain = re.sub(r"`\{python\}[^`]*`", "PYVAL", body)
        plain = re.sub(r"\{[^}]+\}", "", plain)
        plain = re.sub(r"[#*_`~\[\](){}|>]", " ", plain)
        tally["words"] += sum(1 for token in plain.split() if not token.startswith("\\"))

    return tally
|
|
|
|
@staticmethod
|
|
def _empty_stats() -> Dict:
|
|
return {
|
|
"figures": 0, "tables": 0, "equations": 0, "listings": 0,
|
|
"sections": 0, "footnotes": 0, "citations": 0,
|
|
"code_blocks": 0, "text_lines": 0, "words": 0,
|
|
}
|
|
|
|
@staticmethod
|
|
def _aggregate(stats_list: List[Dict]) -> Dict:
|
|
totals: Dict = {}
|
|
for s in stats_list:
|
|
for k, v in s.items():
|
|
if isinstance(v, int):
|
|
totals[k] = totals.get(k, 0) + v
|
|
return totals
|
|
|
|
def _print_summary_stats(self, all_stats, totals, root) -> None:
    """Print one panel of book-wide totals, titled with the scanned scope."""
    metric_rows = (
        ("Sections", "sections"),
        ("Figures", "figures"),
        ("Tables", "tables"),
        ("Equations", "equations"),
        ("Listings", "listings"),
        ("Footnotes", "footnotes"),
        ("Citations", "citations"),
        ("Code blocks", "code_blocks"),
        ("Text lines", "text_lines"),
        ("Words (approx)", "words"),
    )

    summary = Table(show_header=True, header_style="bold cyan", box=None)
    summary.add_column("Metric", style="cyan", width=20)
    summary.add_column("Count", style="white", justify="right", width=10)

    summary.add_row("Chapters", str(len(all_stats)))
    for label, key in metric_rows:
        summary.add_row(label, f"{totals[key]:,}")

    # Prefer a path relative to the book directory for the panel title.
    try:
        scope = str(root.relative_to(self.config_manager.book_dir))
    except ValueError:
        scope = str(root)
    console.print(Panel(summary, title=f"Book Statistics — {scope}", border_style="cyan"))
|
|
|
|
def _print_chapter_stats(self, all_stats, totals) -> None:
    """Print a per-chapter breakdown table with a bold TOTAL row."""
    count_keys = (
        "figures", "tables", "equations", "listings",
        "sections", "footnotes", "citations",
    )
    numeric_columns = (
        ("Fig", 5), ("Tbl", 5), ("Eq", 5), ("Lst", 5),
        ("Sec", 5), ("Fn", 5), ("Cite", 6), ("Words", 8),
    )

    breakdown = Table(show_header=True, header_style="bold cyan", box=None)
    breakdown.add_column("Chapter", style="white", width=32, no_wrap=True)
    for heading, width in numeric_columns:
        breakdown.add_column(heading, justify="right", width=width)

    for entry in all_stats:
        label = entry["chapter"]
        if len(label) > 30:
            # Shorten long chapter names and mark the cut with an ellipsis.
            label = label[:28] + "…"
        breakdown.add_row(
            label,
            *(str(entry[key]) for key in count_keys),
            f"{entry['words']:,}",
        )

    # Totals row
    breakdown.add_row(
        "[bold]TOTAL[/bold]",
        *(f"[bold]{totals[key]}[/bold]" for key in count_keys),
        f"[bold]{totals['words']:,}[/bold]",
    )

    console.print(Panel(breakdown, title="Book Statistics — By Chapter", border_style="cyan"))
|
|
|
|
# ------------------------------------------------------------------
|
|
# Figures
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_figures(self, root: Path, ns: argparse.Namespace) -> bool:
    """Extract figures from the chapter files and emit them in the chosen format.

    With ``--with-pdf`` the chapters are read in Quarto config order and each
    figure is paired, by position, with an entry from the LaTeX figure
    manifest.  Returns False only when no QMD files are found.
    """
    with_pdf = getattr(ns, "with_pdf", False)
    vol = None
    if with_pdf:
        vol = "vol1" if ns.vol1 else ("vol2" if ns.vol2 else None)

    # Config order matters for the positional merge with the manifest; a
    # plain directory scan is the fallback and the non-PDF default.
    chapter_files = self._config_ordered_chapters(vol) if with_pdf else []
    if with_pdf and not chapter_files:
        console.print("[yellow]Could not read chapter order from config. Falling back to directory scan.[/yellow]")
    if not chapter_files:
        chapter_files = self._chapter_files_from_root(root)

    if not chapter_files:
        console.print("[yellow]No QMD files found.[/yellow]")
        return False

    # Collect figures chapter by chapter, numbering them within each chapter.
    all_figures: List[Dict] = []
    for path in chapter_files:
        chapter = self._chapter_name(path)
        rel_file = self._relative(path)
        chapter_figs = self._extract_figures(path)
        for position, fig in enumerate(chapter_figs, 1):
            fig["chapter"] = chapter
            fig["file"] = rel_file
            fig["seq"] = position
        all_figures.extend(chapter_figs)

    if not all_figures:
        console.print("[yellow]No figures found.[/yellow]")
        return True

    def stamp(fig: Dict, number: str, page: str) -> None:
        """Attach PDF numbering fields to one figure record."""
        fig["fig_number"] = number
        fig["page"] = page

    if not with_pdf:
        # No PDF data requested: leave the numbering fields blank.
        for fig in all_figures:
            stamp(fig, "", "")
    else:
        manifest_path = self._find_latex_manifest(ns.manifest, vol)
        if not manifest_path:
            console.print(
                "[yellow]No LaTeX manifest found. Build PDF first, "
                "or pass --manifest PATH.[/yellow]"
            )
            for fig in all_figures:
                stamp(fig, "?", "?")
        else:
            latex_figs = self._parse_latex_manifest(manifest_path)
            console.print(
                f"[dim]LaTeX manifest: {manifest_path.name} "
                f"({len(latex_figs)} figures)[/dim]"
            )
            if len(latex_figs) != len(all_figures):
                console.print(
                    f"[yellow]Warning: LaTeX ({len(latex_figs)}) and QMD "
                    f"({len(all_figures)}) figure counts differ. "
                    f"Manifest may be stale — rebuild PDF to fix.[/yellow]"
                )
            # Pair by sequential position; figures beyond the end of the
            # manifest get "?" placeholders.
            for idx, fig in enumerate(all_figures):
                if idx < len(latex_figs):
                    stamp(fig, latex_figs[idx]["number"], latex_figs[idx]["page"])
                else:
                    stamp(fig, "?", "?")

    output = self._format_figures(all_figures, ns.format, with_pdf)

    if ns.output:
        Path(ns.output).write_text(output, encoding="utf-8")
        console.print(f"[green]Wrote {len(all_figures)} figures to {ns.output}[/green]")
    elif ns.format == "text":
        self._print_figures_rich(all_figures, with_pdf)
    else:
        print(output)

    return True
|
|
|
|
def _chapter_files_from_root(self, root: Path) -> List[Path]:
|
|
"""Get chapter QMD files from a directory, filtering out parts/."""
|
|
files = self._qmd_files(root)
|
|
return [f for f in files if "/parts/" not in str(f)]
|
|
|
|
# ------------------------------------------------------------------
|
|
# Config reading
|
|
# ------------------------------------------------------------------
|
|
|
|
def _pdf_config_paths(self, vol: Optional[str] = None) -> List[Path]:
|
|
"""Return candidate PDF config YAML paths in priority order."""
|
|
quarto_dir = self.config_manager.book_dir
|
|
vol_str = vol or "vol1"
|
|
return [
|
|
quarto_dir / "_quarto.yml",
|
|
quarto_dir / f"config/_quarto-pdf-{vol_str}.yml",
|
|
quarto_dir / f"config/_quarto-pdf-{vol_str}-copyedit.yml",
|
|
]
|
|
|
|
def _read_pdf_config(self, vol: Optional[str] = None) -> Tuple[Optional[Path], Optional[dict], str]:
    """Load the first usable PDF config for *vol*.

    Candidates are tried in priority order; one is usable when it exists,
    parses as YAML, and contains a ``book`` key.  Symlinked configs are
    resolved before reading.

    Returns ``(config_path, parsed_yaml, raw_text)``, or ``(None, None, "")``
    when no candidate qualifies.
    """
    import yaml

    for candidate in self._pdf_config_paths(vol):
        if candidate.exists():
            target = candidate.resolve() if candidate.is_symlink() else candidate
            try:
                raw_text = target.read_text(encoding="utf-8")
                data = yaml.safe_load(raw_text)
            except Exception:
                # Unreadable or malformed — try the next candidate.
                continue
            if data and "book" in data:
                return target, data, raw_text
    return None, None, ""
|
|
|
|
def _output_dirs_from_configs(self, vol: Optional[str] = None) -> List[Path]:
    """Return all ``output-dir`` values declared in the PDF configs for *vol*.

    This is the authoritative way to locate build artifacts — the
    YAML configs define exactly where Quarto writes its output.

    Each value is joined onto the book directory; duplicates are dropped
    while keeping first-seen order.  Configs that are missing, unreadable,
    or not shaped as ``{project: {output-dir: <str>}}`` are skipped.
    """
    import yaml

    quarto_dir = self.config_manager.book_dir
    dirs: List[Path] = []

    for config_path in self._pdf_config_paths(vol):
        if not config_path.exists():
            continue
        resolved = config_path.resolve() if config_path.is_symlink() else config_path
        try:
            parsed = yaml.safe_load(resolved.read_text(encoding="utf-8"))
        except Exception:
            continue

        # safe_load can return any YAML value (list, scalar, None); only a
        # mapping with a mapping-valued "project" can hold an output-dir.
        if not isinstance(parsed, dict):
            continue
        project = parsed.get("project")
        if not isinstance(project, dict):
            continue
        output_dir = project.get("output-dir")
        if not output_dir or not isinstance(output_dir, str):
            continue

        full = quarto_dir / output_dir
        if full not in dirs:
            dirs.append(full)

    return dirs
|
|
|
|
def _config_ordered_chapters(self, vol: Optional[str] = None) -> List[Path]:
|
|
"""Read chapter order from Quarto YAML config (same order as PDF build).
|
|
|
|
This is critical for --with-pdf because the LaTeX manifest is
|
|
sequential — Figure N in the manifest corresponds to the Nth
|
|
figure extracted from the QMD files in config order.
|
|
"""
|
|
quarto_dir = self.config_manager.book_dir
|
|
vol_str = vol or "vol1"
|
|
|
|
_, _, raw = self._read_pdf_config(vol)
|
|
if not raw:
|
|
return []
|
|
|
|
qmd_files: List[Path] = []
|
|
|
|
# Read both commented and uncommented chapter entries (full intended order)
|
|
# Pattern: lines like " - contents/vol1/chapter/chapter.qmd"
|
|
# or " # - contents/vol1/chapter/chapter.qmd"
|
|
comment_pat = re.compile(
|
|
rf"^\s*#?\s*-\s*(contents/{vol_str}/[^\s#]+\.qmd)\s*$",
|
|
re.MULTILINE,
|
|
)
|
|
for m in comment_pat.finditer(raw):
|
|
rel = m.group(1)
|
|
# Skip parts, frontmatter, shelved
|
|
if "/parts/" in rel or "/frontmatter/" in rel or "_shelved" in rel:
|
|
continue
|
|
full = quarto_dir / rel
|
|
if full.exists():
|
|
qmd_files.append(full)
|
|
|
|
return qmd_files
|
|
|
|
# ------------------------------------------------------------------
|
|
# LaTeX manifest
|
|
# ------------------------------------------------------------------
|
|
|
|
@staticmethod
def _is_latex_manifest(path: Path) -> bool:
    """Tell whether *path* carries the manifest header in its first 200 characters.

    Any failure to read the file counts as "not a manifest".
    """
    try:
        head = path.read_text(encoding="utf-8")[:200]
    except Exception:
        return False
    return _MANIFEST_HEADER in head
|
|
|
|
def _find_latex_manifest(
|
|
self, explicit_path: Optional[str], vol: Optional[str]
|
|
) -> Optional[Path]:
|
|
"""Locate the LaTeX figure manifest (*_figures.txt).
|
|
|
|
Search order:
|
|
1. Explicit --manifest path
|
|
2. ``output-dir`` directories declared in the PDF YAML configs
|
|
3. Quarto root directory (fresh from LaTeX, not yet moved)
|
|
"""
|
|
if explicit_path:
|
|
p = Path(explicit_path)
|
|
if not p.is_absolute():
|
|
p = Path.cwd() / p
|
|
return p if p.exists() else None
|
|
|
|
quarto_dir = self.config_manager.book_dir
|
|
candidates: List[Path] = []
|
|
|
|
# Search the output directories declared in the YAML configs
|
|
for output_dir in self._output_dirs_from_configs(vol):
|
|
if output_dir.exists():
|
|
for f in output_dir.glob("*_figures.txt"):
|
|
if self._is_latex_manifest(f):
|
|
candidates.append(f)
|
|
|
|
# Fallback: quarto root (LaTeX writes here during compilation,
|
|
# before the post-render script moves it into the build dir)
|
|
for f in quarto_dir.glob("*_figures.txt"):
|
|
if self._is_latex_manifest(f):
|
|
candidates.append(f)
|
|
|
|
if not candidates:
|
|
return None
|
|
|
|
# Most recently modified wins
|
|
candidates.sort(key=lambda p: p.stat().st_mtime, reverse=True)
|
|
return candidates[0]
|
|
|
|
@staticmethod
def _parse_latex_manifest(path: Path) -> List[Dict]:
    """Read a LaTeX figure manifest into ``[{"number": ..., "page": ...}, ...]``.

    Entries keep the order in which they appear in the file; both values are
    strings.  An unreadable file yields an empty list.
    """
    try:
        manifest_text = path.read_text(encoding="utf-8")
    except Exception:
        return []
    return [
        {"number": entry.group(1), "page": entry.group(2)}
        for entry in _LATEX_FIG_PAT.finditer(manifest_text)
    ]
|
|
|
|
# ------------------------------------------------------------------
|
|
# Figure extraction from QMD
|
|
# ------------------------------------------------------------------
|
|
|
|
def _extract_figures(self, path: Path) -> List[Dict]:
    """Extract all figures from a QMD file, deduplicating by label.

    Three declaration styles are recognised:
      1. fenced divs       ``::: {#fig-id fig-cap="..." fig-alt="..."}``
      2. markdown images   ``![caption](file){#fig-id fig-alt="..."}``
      3. code cells        `````{python}`` followed by ``#| label: fig-...``

    Returns one dict per figure (``id``, ``label``, ``caption``,
    ``alt_text``, ``source``) in file order.  An unreadable file yields an
    empty list.
    """
    try:
        content = path.read_text(encoding="utf-8")
    except Exception:
        return []

    def record(fig_id: str, caption: str, alt_text: str, source: str, pos: int) -> Dict:
        """Build one figure entry; ``pos`` is only used for ordering."""
        return {
            "id": fig_id,
            "label": self._extract_title(caption),
            "caption": caption,
            "alt_text": alt_text,
            "source": source,
            "pos": pos,
        }

    raw: List[Dict] = []

    # 1. Fenced divs: ::: {#fig-id fig-cap="..." fig-alt="..."}
    for m in _DIV_FIG.finditer(content):
        attrs = m.group(1)
        label_m = re.search(r"#(fig-[\w-]+)", attrs)
        if not label_m:
            continue
        cap_m = _ATTR_CAP.search(attrs)
        alt_m = _ATTR_ALT.search(attrs)
        raw.append(record(
            label_m.group(1),
            self._unescape(cap_m.group(1)) if cap_m else "",
            self._unescape(alt_m.group(1)) if alt_m else "",
            "div",
            m.start(),
        ))

    # 2. Markdown images: ![caption](file){#fig-id fig-alt="..."}
    for m in _IMG_FIG.finditer(content):
        attrs = m.group(2)
        label_m = re.search(r"#(fig-[\w-]+)", attrs)
        if not label_m:
            continue
        # The bracket text is the caption unless fig-cap overrides it.
        caption = m.group(1).strip()
        cap_m = _ATTR_CAP.search(attrs)
        alt_m = _ATTR_ALT.search(attrs)
        if cap_m:
            caption = self._unescape(cap_m.group(1))
        raw.append(record(
            label_m.group(1),
            caption,
            self._unescape(alt_m.group(1)) if alt_m else "",
            "image",
            m.start(),
        ))

    # 3. Code-cell figures: ```{python} with #| label: fig-...
    # Quoted values may contain backslash-escaped quotes; consume escape
    # pairs as a unit so the value is not cut short, then unescape it the
    # same way the div and image branches do.
    quoted_value = r'"((?:[^"\\]|\\.)*)"'
    for m in _CODE_BLOCK_FIG.finditer(content):
        cell_opts_text = m.group(1)
        lbl_m = re.search(r"#\|\s*label:\s*(fig-[\w-]+)", cell_opts_text)
        if not lbl_m:
            continue
        cap_m = re.search(r"#\|\s*fig-cap:\s*" + quoted_value, cell_opts_text)
        alt_m = re.search(r"#\|\s*fig-alt:\s*" + quoted_value, cell_opts_text)
        raw.append(record(
            lbl_m.group(1),
            self._unescape(cap_m.group(1)) if cap_m else "",
            self._unescape(alt_m.group(1)) if alt_m else "",
            "code-cell",
            m.start(),
        ))

    # Sort by position in file, then deduplicate by label
    raw.sort(key=lambda x: x["pos"])
    seen: Dict[str, Dict] = {}
    for fig in raw:
        fid = fig["id"]
        if fid not in seen:
            seen[fid] = fig
        elif not seen[fid]["caption"] and fig["caption"]:
            # Keep whichever has a richer caption
            seen[fid] = fig

    return [
        {k: v for k, v in fig.items() if k != "pos"}
        for fig in sorted(seen.values(), key=lambda x: x["pos"])
    ]
|
|
|
|
@staticmethod
|
|
def _unescape(s: str) -> str:
|
|
return s.replace('\\"', '"').replace("\\'", "'")
|
|
|
|
@staticmethod
|
|
def _strip_quotes(s: str) -> str:
|
|
s = s.strip()
|
|
if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
|
|
s = s[1:-1]
|
|
return s
|
|
|
|
@staticmethod
|
|
def _extract_title(caption: str) -> str:
|
|
"""Extract the bold title from a caption."""
|
|
if not caption:
|
|
return ""
|
|
m = re.match(r"\*\*([^*]+)\*\*", caption)
|
|
if m:
|
|
return m.group(1).rstrip(":.").strip()
|
|
colon = caption.find(":")
|
|
period = caption.find(".")
|
|
if colon > 0 and (period < 0 or colon < period):
|
|
return caption[:colon].strip()
|
|
if period > 0:
|
|
return caption[:period].strip()
|
|
return caption[:60].strip()
|
|
|
|
# ------------------------------------------------------------------
|
|
# Output formatting
|
|
# ------------------------------------------------------------------
|
|
|
|
def _fig_display_number(self, fig: Dict, with_pdf: bool) -> str:
|
|
"""Return the display number for a figure."""
|
|
if with_pdf and fig.get("fig_number"):
|
|
return fig["fig_number"]
|
|
return str(fig.get("seq", ""))
|
|
|
|
def _format_figures(self, figures: List[Dict], fmt: str, with_pdf: bool = False) -> str:
|
|
if fmt == "csv":
|
|
header = "chapter,fig_number,page,label,id,caption,alt_text,source,file"
|
|
lines = [header]
|
|
for f in figures:
|
|
cap = f["caption"].replace('"', '""')
|
|
alt = f["alt_text"].replace('"', '""')
|
|
lbl = f["label"].replace('"', '""')
|
|
fig_num = f.get("fig_number", "")
|
|
page = f.get("page", "")
|
|
lines.append(
|
|
f'"{f["chapter"]}",{fig_num},{page},"{lbl}",{f["id"]},"{cap}","{alt}",{f["source"]},{f["file"]}'
|
|
)
|
|
return "\n".join(lines) + "\n"
|
|
|
|
if fmt == "markdown":
|
|
title = "# Figure List"
|
|
if with_pdf:
|
|
title += " (with PDF numbers)"
|
|
lines = [f"{title}\n"]
|
|
current_chapter = ""
|
|
for f in figures:
|
|
if f["chapter"] != current_chapter:
|
|
current_chapter = f["chapter"]
|
|
lines.append(f"\n## {current_chapter}\n")
|
|
num = self._fig_display_number(f, with_pdf)
|
|
page = f.get("page", "")
|
|
header = f"**Figure {num}**"
|
|
if with_pdf and page:
|
|
header += f" (Page {page})"
|
|
header += f" (`{f['id']}`)"
|
|
lines.append(header)
|
|
if f["label"]:
|
|
lines.append(f" **Title**: {f['label']}")
|
|
if f["caption"]:
|
|
lines.append(f" **Caption**: {f['caption']}")
|
|
if f["alt_text"]:
|
|
lines.append(f" **Alt-text**: {f['alt_text']}")
|
|
lines.append(f" *Source*: {f['source']} | *File*: {f['file']}")
|
|
lines.append("")
|
|
# Summary
|
|
total = len(figures)
|
|
with_cap = sum(1 for f in figures if f["caption"])
|
|
with_alt = sum(1 for f in figures if f["alt_text"])
|
|
lines.append(f"---\n**Total**: {total} figures | Captions: {with_cap}/{total} | Alt-text: {with_alt}/{total}")
|
|
return "\n".join(lines)
|
|
|
|
# Plain text
|
|
lines = []
|
|
current_chapter = ""
|
|
for f in figures:
|
|
if f["chapter"] != current_chapter:
|
|
current_chapter = f["chapter"]
|
|
lines.append(f"\n{'='*70}")
|
|
lines.append(f" {current_chapter}")
|
|
lines.append(f"{'='*70}")
|
|
num = self._fig_display_number(f, with_pdf)
|
|
page = f.get("page", "")
|
|
if with_pdf and page:
|
|
lines.append(f" Figure {num:>6s} (Page {page:>4s}) {f['id']}")
|
|
else:
|
|
lines.append(f" Fig {num:>3s} {f['id']}")
|
|
if f["label"]:
|
|
lines.append(f" Title: {f['label']}")
|
|
has_cap = "YES" if f["caption"] else "MISSING"
|
|
has_alt = "YES" if f["alt_text"] else "MISSING"
|
|
lines.append(f" Caption: {has_cap} | Alt-text: {has_alt}")
|
|
lines.append("")
|
|
total = len(figures)
|
|
with_cap = sum(1 for f in figures if f["caption"])
|
|
with_alt = sum(1 for f in figures if f["alt_text"])
|
|
lines.append(f"Total: {total} figures | Captions: {with_cap}/{total} | Alt-text: {with_alt}/{total}")
|
|
return "\n".join(lines) + "\n"
|
|
|
|
def _print_figures_rich(self, figures: List[Dict], with_pdf: bool = False) -> None:
    """Print the figure list as a rich table, followed by a coverage summary.

    In PDF mode the table starts with figure-number and page columns;
    otherwise it starts with a single sequence-number column.
    """
    listing = Table(show_header=True, header_style="bold cyan", box=None)
    if with_pdf:
        listing.add_column("Fig #", style="bold", width=8, justify="right")
        listing.add_column("Page", style="dim", width=5, justify="right")
    else:
        listing.add_column("#", style="dim", width=4, justify="right")
    listing.add_column("Label", style="cyan", width=30, no_wrap=True)
    listing.add_column("Title", style="white", width=30, no_wrap=True)
    listing.add_column("Cap", width=4, justify="center")
    listing.add_column("Alt", width=4, justify="center")
    listing.add_column("Src", style="dim", width=6)

    # Blank cells occupying the leading number column(s) in chapter rows.
    leading_blanks = ("", "") if with_pdf else ("",)
    yes_mark, no_mark = "[green]✓[/green]", "[red]✗[/red]"

    previous_chapter = ""
    for fig in figures:
        if fig["chapter"] != previous_chapter:
            previous_chapter = fig["chapter"]
            if len(previous_chapter) <= 30:
                shown = previous_chapter
            else:
                shown = previous_chapter[:28] + "…"
            # Chapter separator row, placed in the Label column.
            listing.add_row(
                *leading_blanks, f"[bold yellow]{shown}[/bold yellow]", "", "", "", ""
            )

        label = fig["label"]
        if len(label) > 30:
            label = label[:28] + "…"
        cap_cell = yes_mark if fig["caption"] else no_mark
        alt_cell = yes_mark if fig["alt_text"] else no_mark
        number = self._fig_display_number(fig, with_pdf)

        if with_pdf:
            number_cells = (number, fig.get("page", ""))
        else:
            number_cells = (number,)
        listing.add_row(*number_cells, fig["id"], label, cap_cell, alt_cell, fig["source"])

    total = len(figures)
    with_cap = sum(1 for fig in figures if fig["caption"])
    with_alt = sum(1 for fig in figures if fig["alt_text"])

    panel_title = "Figure List"
    if with_pdf:
        panel_title += " (with PDF numbers)"
    console.print(Panel(listing, title=panel_title, border_style="cyan"))
    console.print(
        f"[bold]Total[/bold]: {total} figures | "
        f"Captions: {with_cap}/{total} | "
        f"Alt-text: {with_alt}/{total}"
    )
|
|
|
|
# ------------------------------------------------------------------
|
|
# Concepts (ported from extract_concepts.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_concepts(self, root: Path, ns) -> bool:
    """Print a per-chapter digest of key concepts found in QMD files.

    For each file returned by ``self._qmd_files(root)`` that is not under a
    ``parts/`` or ``frontmatter/`` directory, the digest shows:

    * up to 8 H2 headings (anchors such as ``{#sec-...}`` removed, headings
      starting with "Purpose" skipped),
    * up to 12 bold ``**terms**`` (alphabetical),
    * up to 6 phrases matched by "is defined as" / "refers to" /
      "We define ... as" (alphabetical),
    * the number of ``[^fn-...]:`` footnote definitions.

    Lines inside fenced code blocks and a leading YAML front-matter block are
    ignored when looking for headings, so ``## comment`` lines in code cells
    are not mistaken for topics.

    Args:
        root: Passed to ``self._qmd_files`` to locate the QMD files.
        ns: Parsed CLI namespace; not read by this subcommand.

    Returns:
        ``False`` if no chapter files were found, ``True`` otherwise.
    """
    # Extracted text is interpolated into Rich markup below; escaping keeps
    # bracketed fragments (e.g. "[link]") from being consumed as style tags.
    from rich.markup import escape

    files = self._qmd_files(root)
    # Compare against the POSIX form so the directory filter also works when
    # the platform path separator is a backslash.
    skip_dirs = ("/parts/", "/frontmatter/")
    chapter_files = [
        f for f in files
        if not any(marker in f.as_posix() for marker in skip_dirs)
    ]

    if not chapter_files:
        console.print("[yellow]No QMD files found.[/yellow]")
        return False

    bold_pat = re.compile(r"\*\*([^*]+)\*\*")
    fn_def_pat = re.compile(r"\[\^fn-([^\]]+)\]:\s*(.+?)(?=\n\n|\[\^|\Z)", re.DOTALL)
    def_patterns = [
        re.compile(r"(\w[\w\s]+?)\s+is defined as", re.IGNORECASE),
        re.compile(r"(\w[\w\s]+?)\s+refers to", re.IGNORECASE),
        re.compile(r"We define\s+(\w[\w\s]+?)\s+as", re.IGNORECASE),
    ]
    heading_pat = re.compile(r"^(#{1,6})\s+(.*)")
    anchor_pat = re.compile(r"\{#[^}]+\}")

    for path in chapter_files:
        try:
            content = path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as exc:
            # Best effort: report the unreadable file and keep going.
            console.print(
                f"[yellow]Skipping {escape(str(self._relative(path)))}: "
                f"{escape(str(exc))}[/yellow]"
            )
            continue

        lines = content.splitlines()
        chapter = self._chapter_name(path)

        # Extract H2 topics, ignoring front matter and fenced code blocks.
        h2_topics = []
        in_front_matter = bool(lines) and _YAML_FENCE.match(lines[0]) is not None
        in_code = False
        for idx, line in enumerate(lines):
            if in_front_matter:
                # The closing '---' ends the front matter; line 0 opened it.
                if idx > 0 and _YAML_FENCE.match(line):
                    in_front_matter = False
                continue
            if _CODE_FENCE.match(line):
                in_code = not in_code
                continue
            if in_code:
                continue
            m = heading_pat.match(line)
            if m and len(m.group(1)) == 2:
                text = anchor_pat.sub("", m.group(2)).strip()
                if not text.startswith("Purpose"):
                    h2_topics.append(text)

        # Bold terms. The pattern can match across line breaks, so collapse
        # internal whitespace to keep each term on one line and de-duplicate
        # variants that differ only in wrapping.
        bold_terms = set()
        for m in bold_pat.finditer(content):
            term = " ".join(m.group(1).split())
            if len(term) > 2 and not term.startswith("Note"):
                bold_terms.add(term)

        # Definitions (same whitespace normalisation as bold terms).
        definitions = set()
        for pat in def_patterns:
            for m in pat.finditer(content):
                term = " ".join(m.group(1).split())
                if len(term) < 50:
                    definitions.add(term)

        # Footnotes: only the number of definitions is reported.
        footnote_count = sum(1 for _ in fn_def_pat.finditer(content))

        # Print
        name = chapter[:40] if len(chapter) <= 40 else chapter[:38] + "…"
        console.print(f"\n[bold cyan]{name}[/bold cyan] [dim]({self._relative(path)})[/dim]")

        if h2_topics:
            console.print(f" [yellow]Topics:[/yellow] {escape(', '.join(h2_topics[:8]))}")
        if bold_terms:
            terms = sorted(bold_terms)[:12]
            console.print(f" [yellow]Key terms:[/yellow] {escape(', '.join(terms))}")
        if definitions:
            defs = sorted(definitions)[:6]
            console.print(f" [yellow]Defines:[/yellow] {escape(', '.join(defs))}")
        if footnote_count:
            console.print(f" [yellow]Footnotes:[/yellow] {footnote_count}")

    console.print(f"\n[bold]Scanned {len(chapter_files)} files.[/bold]")
    return True
|
|
|
|
# ------------------------------------------------------------------
|
|
# Headers (ported from extract_headers.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_headers(self, root: Path, ns) -> bool:
    """Print a table of all Markdown section headers with their levels.

    Every file returned by ``self._qmd_files(root)`` that is not under a
    ``parts/`` directory is scanned for ATX headings (``#`` to ``######``).
    Lines inside fenced code blocks and a leading YAML front-matter block are
    ignored, so ``# comment`` lines in code cells are not listed as headers.
    The file name is shown only on the first header row of each file.

    Args:
        root: Passed to ``self._qmd_files`` to locate the QMD files.
        ns: Parsed CLI namespace; not read by this subcommand.

    Returns:
        ``False`` if no files were found, ``True`` otherwise.
    """
    # Header text goes into Rich table cells; escaping keeps bracketed
    # fragments (e.g. "[link]") from being consumed as style tags.
    from rich.markup import escape

    files = self._qmd_files(root)
    # Compare against the POSIX form so the directory filter also works when
    # the platform path separator is a backslash.
    chapter_files = [f for f in files if "/parts/" not in f.as_posix()]

    if not chapter_files:
        console.print("[yellow]No QMD files found.[/yellow]")
        return False

    heading_pat = re.compile(r"^(#{1,6})\s+(.*)")

    table = Table(show_header=True, header_style="bold cyan", box=None)
    table.add_column("File", style="dim", width=30, no_wrap=True)
    table.add_column("Lvl", style="cyan", width=5, justify="center")
    table.add_column("Header", style="white", width=50)

    total = 0
    for path in chapter_files:
        try:
            lines = path.read_text(encoding="utf-8").splitlines()
        except (OSError, UnicodeError) as exc:
            # Best effort: report the unreadable file and keep going.
            console.print(
                f"[yellow]Skipping {escape(str(self._relative(path)))}: "
                f"{escape(str(exc))}[/yellow]"
            )
            continue

        rel = self._relative(path)
        if len(rel) > 28:
            # Keep the tail of the path; it is the most specific part.
            rel = "…" + rel[-27:]

        in_front_matter = bool(lines) and _YAML_FENCE.match(lines[0]) is not None
        in_code = False
        for idx, line in enumerate(lines):
            if in_front_matter:
                # The closing '---' ends the front matter; line 0 opened it.
                if idx > 0 and _YAML_FENCE.match(line):
                    in_front_matter = False
                continue
            if _CODE_FENCE.match(line):
                in_code = not in_code
                continue
            if in_code:
                continue

            m = heading_pat.match(line)
            if not m:
                continue
            level = m.group(1)
            text = m.group(2).strip()
            # Indent based on level for visual hierarchy
            indent = " " * (len(level) - 1)
            table.add_row(escape(rel), level, escape(f"{indent}{text}"))
            total += 1
            rel = ""  # Only show filename on first header

    console.print(Panel(table, title="Section Headers", border_style="cyan"))
    console.print(f"[bold]Total:[/bold] {total} headers across {len(chapter_files)} files")
    return True
|
|
|
|
# ------------------------------------------------------------------
|
|
# Acronyms (ported from find_acronyms.py)
|
|
# ------------------------------------------------------------------
|
|
|
|
def _run_acronyms(self, root: Path, ns) -> bool:
    """Tabulate parenthesised acronyms such as ``(CNN)`` or ``(GPUs)``.

    Scans every file returned by ``self._qmd_files(root)`` for two or more
    capital letters (optionally followed by a lowercase ``s``) wrapped in
    parentheses, then prints one row per acronym, most frequent first, with
    its occurrence count and up to five of the files it appears in.

    Args:
        root: Passed to ``self._qmd_files`` to locate the QMD files.
        ns: Parsed CLI namespace; not read by this subcommand.

    Returns:
        ``False`` if no QMD files were found, ``True`` otherwise (including
        when files were scanned but contained no acronyms).
    """
    from collections import Counter

    qmd_paths = self._qmd_files(root)
    if not qmd_paths:
        console.print("[yellow]No QMD files found.[/yellow]")
        return False

    paren_acronym = re.compile(r"\(([A-Z]{2,}s?)\)")
    tally = Counter()
    seen_in: Dict[str, List[str]] = defaultdict(list)

    for qmd in qmd_paths:
        try:
            text = qmd.read_text(encoding="utf-8")
        except Exception:
            continue
        # findall yields the captured acronym for each match, in order.
        hits = paren_acronym.findall(text)
        tally.update(hits)
        # Record each file once per acronym, however often it occurs there.
        for name in set(hits):
            seen_in[name].append(self._relative(qmd))

    if not tally:
        console.print("[yellow]No acronyms found.[/yellow]")
        return True

    table = Table(show_header=True, header_style="bold cyan", box=None)
    table.add_column("Acronym", style="cyan", width=12)
    table.add_column("Count", style="white", width=7, justify="right")
    table.add_column("Files", style="dim", width=50)

    # most_common() orders by descending count; ties keep first-seen order.
    for name, occurrences in tally.most_common():
        sources = seen_in[name]
        shown = [entry.split("/")[-1].replace(".qmd", "") for entry in sources[:5]]
        files_str = ", ".join(shown)
        overflow = len(sources) - 5
        if overflow > 0:
            files_str += f" +{overflow} more"
        table.add_row(name, str(occurrences), files_str)

    console.print(Panel(table, title="Acronyms Found", border_style="cyan"))
    console.print(f"[bold]Total:[/bold] {len(tally)} unique acronyms, {sum(tally.values())} occurrences")
    return True
|