mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 09:38:33 -05:00
582 lines
21 KiB
Python
582 lines
21 KiB
Python
"""
|
|
File and chapter discovery for MLSysBook CLI.
|
|
|
|
Handles finding chapter files, validating paths, and managing file operations.
|
|
Supports volume-aware discovery for vol1 and vol2.
|
|
|
|
Single source of truth for chapter ordering: `get_chapters_from_config()` reads
|
|
the PDF YAML config for a volume and returns the ordered list of testable chapter
|
|
stems. All commands (debug, build, validate, etc.) should call this method rather
|
|
than maintaining their own exclusion lists or filesystem scans.
|
|
"""
|
|
|
|
import re
|
|
import fnmatch
|
|
from pathlib import Path
|
|
from typing import List, Optional, Dict, Any
|
|
from rich.console import Console
|
|
|
|
# Shared Rich console used for all user-facing discovery output.
console = Console()

# Volume directories
VOLUME_DIRS = ["vol1", "vol2"]

# Shared content directory (sibling to vol1/, vol2/ under contents/)
SHARED_DIR = "shared"
|
|
|
|
# Chapter stems that cannot be rendered standalone and are always excluded from
# per-chapter build/debug operations.
SKIP_STEMS = frozenset({"index", "references"})


def get_chapters_from_config(book_dir: Path, volume: str) -> List[str]:
    """Return the ordered list of buildable file stems from the PDF config.

    Reads ``book/config/_quarto-pdf-{volume}.yml`` and extracts every entry
    under ``book.chapters`` — including frontmatter, parts pages, and shared
    files. Appendices are excluded. Only ``index.qmd`` and ``references.qmd``
    are skipped, as they cannot be rendered standalone.

    Args:
        book_dir: Path to the ``book/quarto`` directory.
        volume: ``"vol1"`` or ``"vol2"``.

    Returns:
        Ordered list of file stems in YAML order (e.g. ``["dedication",
        "introduction", "distributed_training", ...]``). Empty list if the
        config is missing or cannot be parsed.
    """
    config_path = book_dir / "config" / f"_quarto-pdf-{volume}.yml"
    if not config_path.exists():
        return []

    def _keep(entry_path: str) -> bool:
        # index/references cannot be rendered on their own.
        return Path(entry_path).stem not in SKIP_STEMS

    def _ordered_stems(paths) -> List[str]:
        # Preserve config order while dropping duplicates and skipped stems.
        stems: List[str] = []
        known: set = set()
        for p in paths:
            if not p or not _keep(p):
                continue
            stem = Path(p).stem
            if stem and stem not in known:
                known.add(stem)
                stems.append(stem)
        return stems

    # --- YAML-aware path (preferred) ---
    try:
        import yaml  # type: ignore

        data = yaml.safe_load(config_path.read_text())
        entries = data.get("book", {}).get("chapters", [])

        paths = []
        for entry in entries:
            if isinstance(entry, str):
                paths.append(entry)
            elif isinstance(entry, dict):
                # Part/file mappings carry the path under the "file" key.
                paths.append(entry.get("file", ""))
        return _ordered_stems(paths)
    except Exception:
        # Missing PyYAML or malformed YAML: fall through to the regex scan.
        pass

    # --- Regex fallback (no PyYAML) ---
    text = config_path.read_text()

    # Isolate the chapters: block (stop before appendices:)
    block_match = re.search(
        r'^\s{2}chapters:\s*\n(.*?)(?=^\s{2}\w|\Z)',
        text,
        re.MULTILINE | re.DOTALL,
    )
    scan_region = block_match.group(1) if block_match else text

    found = [
        m.group(1)
        for m in re.finditer(r'\s*-\s*(contents/[^\s#]+\.qmd)', scan_region)
    ]
    return _ordered_stems(found)
|
|
|
|
|
|
class AmbiguousChapterError(Exception):
    """Raised when a chapter name exists in multiple volumes."""

    def __init__(self, chapter_name: str, locations: List[str]):
        # Keep the details on the exception so callers can render their own
        # disambiguation hints (e.g. "use vol1/<name>").
        self.chapter_name = chapter_name
        self.locations = locations
        message = f"'{chapter_name}' exists in multiple volumes: {', '.join(locations)}"
        super().__init__(message)
|
|
|
|
|
|
class ChapterDiscovery:
    """Discovers and manages chapter files in the MLSysBook project."""

    def __init__(self, book_dir: Path):
        """Initialize chapter discovery.

        Args:
            book_dir: Path to the book directory (usually 'quarto')
        """
        # Coerce to Path so plain-string inputs work as well.
        self.book_dir = Path(book_dir)
        # All chapter content lives under <book_dir>/contents.
        self.contents_dir = self.book_dir / "contents"
|
|
|
|
def get_chapters_from_config(self, volume: str) -> List[str]:
    """Return the ordered list of testable chapter stems for a volume.

    Thin wrapper around the module-level ``get_chapters_from_config``
    helper so that every CLI command shares a single implementation.
    Prefer this over ``get_volume_chapters`` whenever the canonical
    build order matters.

    Args:
        volume: ``"vol1"`` or ``"vol2"``.

    Returns:
        Ordered list of chapter stems from the PDF config (e.g.
        ``["introduction", "distributed_training", ...]``).
    """
    # Name resolution is global at call time, so this reaches the
    # module-level function, not this method.
    ordered_stems = get_chapters_from_config(self.book_dir, volume)
    return ordered_stems
|
|
|
|
def _get_volume_from_path(self, path: Path) -> Optional[str]:
|
|
"""Extract volume (vol1/vol2) from a file path.
|
|
|
|
Args:
|
|
path: Path to check
|
|
|
|
Returns:
|
|
'vol1', 'vol2', or None if not in a volume directory
|
|
"""
|
|
try:
|
|
rel_path = path.relative_to(self.contents_dir)
|
|
parts = rel_path.parts
|
|
if parts and parts[0] in VOLUME_DIRS:
|
|
return parts[0]
|
|
except ValueError:
|
|
pass
|
|
return None
|
|
|
|
def _parse_chapter_spec(self, chapter_spec: str) -> tuple[Optional[str], str]:
|
|
"""Parse a chapter specification that may include volume prefix.
|
|
|
|
Args:
|
|
chapter_spec: Chapter name, optionally with volume prefix (e.g., 'vol1/intro')
|
|
|
|
Returns:
|
|
Tuple of (volume, chapter_name) where volume may be None
|
|
"""
|
|
if "/" in chapter_spec:
|
|
parts = chapter_spec.split("/", 1)
|
|
if parts[0] in VOLUME_DIRS:
|
|
return parts[0], parts[1]
|
|
return None, chapter_spec
|
|
|
|
@staticmethod
|
|
def _match_score(query: str, candidate: str) -> int:
|
|
"""Score how well a query matches a candidate chapter name.
|
|
|
|
Higher score = better match. Uses longest common substring length
|
|
as primary metric, with shorter candidate names preferred as tiebreaker.
|
|
|
|
Args:
|
|
query: The search term (e.g., 'dnn_')
|
|
candidate: The chapter file stem (e.g., 'nn_architectures')
|
|
|
|
Returns:
|
|
Match score (higher is better), 0 if no match
|
|
"""
|
|
q = query.lower()
|
|
c = candidate.lower()
|
|
|
|
# Exact match gets highest score
|
|
if q == c:
|
|
return 10000
|
|
|
|
# Starts-with match gets high score, weighted by coverage
|
|
if c.startswith(q):
|
|
return 5000 + int(1000 * len(q) / len(c))
|
|
|
|
# Contains match gets medium score, weighted by coverage
|
|
if q in c:
|
|
return 2000 + int(1000 * len(q) / len(c))
|
|
|
|
# Partial overlap: find longest common substring
|
|
best = 0
|
|
for i in range(len(q)):
|
|
for j in range(i + 1, len(q) + 1):
|
|
sub = q[i:j]
|
|
if sub in c and len(sub) > best:
|
|
best = len(sub)
|
|
if best >= 2:
|
|
return 500 + int(1000 * best / len(c))
|
|
|
|
return 0
|
|
|
|
def find_chapter_file(self, chapter_spec: str, allow_fuzzy: bool = False) -> Optional[Path]:
    """Find a chapter file by name, using best-match scoring.

    Supports volume-prefixed names (e.g., 'vol1/intro') for disambiguation.
    Raises AmbiguousChapterError if chapter exists in multiple volumes
    without a volume prefix.

    Matching strategy (in order of priority):
    1. Exact stem match (e.g., 'nn_computation' → nn_computation.qmd)
    2. Best fuzzy match scored by: starts-with > contains > partial overlap,
       with higher coverage (query length / candidate length) preferred.

    Args:
        chapter_spec: Chapter name to search for, optionally with volume prefix
        allow_fuzzy: If True, allow fuzzy fallback for non-exact matches.

    Returns:
        Path to the chapter file if found, None otherwise

    Raises:
        AmbiguousChapterError: If chapter exists in multiple volumes without prefix
    """
    if not self.contents_dir.exists():
        console.print(f"[red]Contents directory not found: {self.contents_dir}[/red]")
        return None

    # Parse volume prefix if present
    volume_filter, chapter_name = self._parse_chapter_spec(chapter_spec)

    # Determine search directory: restrict to the requested volume, or scan
    # all of contents/ when no prefix was given.
    if volume_filter:
        search_dir = self.contents_dir / volume_filter
        if not search_dir.exists():
            console.print(f"[red]Volume directory not found: {search_dir}[/red]")
            return None
    else:
        search_dir = self.contents_dir

    # Try exact match first
    exact_matches = list(search_dir.rglob(f"{chapter_name}.qmd"))

    # When a volume prefix was given, also search the shared directory so that
    # files like contents/shared/notation.qmd are resolvable as "vol1/notation".
    if volume_filter:
        shared_dir = self.contents_dir / SHARED_DIR
        if shared_dir.exists():
            exact_matches += list(shared_dir.rglob(f"{chapter_name}.qmd"))

    # Filter to actual chapter files (in volume directories, not frontmatter/backmatter)
    chapter_matches = []
    for match in exact_matches:
        vol = self._get_volume_from_path(match)
        is_shared = SHARED_DIR in match.relative_to(self.contents_dir).parts
        if vol or volume_filter or is_shared:
            chapter_matches.append(match)

    if not chapter_matches and allow_fuzzy:
        # No exact match — score all .qmd files and pick the best
        all_qmd_files = list(search_dir.rglob("*.qmd"))

        scored = []
        for match in all_qmd_files:
            vol = self._get_volume_from_path(match)
            # Fuzzy candidates must be inside a volume dir (or a volume
            # prefix must already narrow the search).
            if not (vol or volume_filter):
                continue
            score = self._match_score(chapter_name, match.stem)
            if score > 0:
                scored.append((score, match))

        if scored:
            # Sort by score descending
            scored.sort(key=lambda x: x[0], reverse=True)
            best_score = scored[0][0]
            # Reject weak fuzzy matches to avoid incorrect chapter resolution.
            # 2000 is the "contains" tier floor in _match_score.
            if best_score < 2000:
                return None
            # Collect all matches with the same best score
            chapter_matches = [m for s, m in scored if s == best_score]

    if not chapter_matches:
        return None

    if len(chapter_matches) == 1:
        return chapter_matches[0]

    # Multiple matches - check if they're in different volumes (ambiguous)
    if not volume_filter:
        # First match per volume wins; later duplicates in the same volume
        # are ignored.
        volumes_found = {}
        for match in chapter_matches:
            vol = self._get_volume_from_path(match)
            if vol:
                if vol not in volumes_found:
                    volumes_found[vol] = match

        if len(volumes_found) > 1:
            # Ambiguous only when the matched chapter stem is actually the same
            # in multiple volumes (e.g., vol1/introduction and vol2/introduction).
            stems = {m.stem for m in volumes_found.values()}
            if len(stems) == 1:
                stem = next(iter(stems))
                locations = [f"{vol}/{stem}" for vol in sorted(volumes_found.keys())]
                raise AmbiguousChapterError(stem, locations)
            # Tied fuzzy matches with different stems are not reliable.
            return None

    # Return the first match
    return chapter_matches[0]
|
|
|
|
def get_all_chapters(self, volume: Optional[str] = None) -> List[Dict[str, Any]]:
|
|
"""Get all chapter files with metadata.
|
|
|
|
Args:
|
|
volume: Optional volume filter ('vol1', 'vol2', or None for all)
|
|
|
|
Returns:
|
|
List of dictionaries containing chapter information
|
|
"""
|
|
chapters = []
|
|
|
|
if not self.contents_dir.exists():
|
|
return chapters
|
|
|
|
# Determine search directory
|
|
if volume:
|
|
if volume not in VOLUME_DIRS:
|
|
console.print(f"[red]Invalid volume: {volume}. Use 'vol1' or 'vol2'[/red]")
|
|
return chapters
|
|
search_dir = self.contents_dir / volume
|
|
else:
|
|
search_dir = self.contents_dir
|
|
|
|
for qmd_file in search_dir.rglob("*.qmd"):
|
|
# Skip certain files
|
|
if qmd_file.name in ["index.qmd", "404.qmd"]:
|
|
continue
|
|
|
|
# Get relative path from contents directory
|
|
rel_path = qmd_file.relative_to(self.contents_dir)
|
|
|
|
# Determine volume
|
|
vol = self._get_volume_from_path(qmd_file)
|
|
|
|
# Skip non-volume files (frontmatter, backmatter) unless searching all
|
|
if not volume and not vol:
|
|
continue
|
|
|
|
# Extract chapter info
|
|
chapter_info = {
|
|
"name": qmd_file.stem,
|
|
"path": qmd_file,
|
|
"relative_path": rel_path,
|
|
"directory": qmd_file.parent.name,
|
|
"volume": vol,
|
|
"size": qmd_file.stat().st_size if qmd_file.exists() else 0
|
|
}
|
|
|
|
chapters.append(chapter_info)
|
|
|
|
# Sort by path for consistent ordering
|
|
chapters.sort(key=lambda x: str(x["relative_path"]))
|
|
return chapters
|
|
|
|
def get_volume_chapters(self, volume: str) -> List[Path]:
|
|
"""Get all chapter file paths for a specific volume.
|
|
|
|
Args:
|
|
volume: Volume to get chapters for ('vol1' or 'vol2')
|
|
|
|
Returns:
|
|
List of chapter file paths
|
|
"""
|
|
chapters = self.get_all_chapters(volume=volume)
|
|
return [ch["path"] for ch in chapters]
|
|
|
|
def show_chapters(self, volume: Optional[str] = None) -> None:
    """Display available chapters in a formatted table.

    Args:
        volume: Optional volume filter ('vol1', 'vol2', or None for all)
    """
    from rich.table import Table

    chapters = self.get_all_chapters(volume=volume)
    if not chapters:
        console.print("[yellow]No chapters found[/yellow]")
        return

    table = Table(show_header=True, header_style="bold blue")
    for heading, style, width in (
        ("Chapter", "green", 25),
        ("Volume", "magenta", 8),
        ("Directory", "cyan", 20),
        ("Size", "dim", 10),
    ):
        table.add_column(heading, style=style, width=width)

    for entry in chapters:
        # Sizes are stored in bytes; display in KB with one decimal.
        kb = entry["size"] / 1024 if entry["size"] > 0 else 0
        table.add_row(
            entry["name"],
            entry["volume"] or "-",
            entry["directory"],
            f"{kb:.1f} KB" if kb > 0 else "0 KB",
        )

    console.print(table)

    # Summary line: single-volume count, or a per-volume breakdown.
    if volume:
        console.print(f"\n[dim]Found {len(chapters)} chapters in {volume}[/dim]")
    else:
        vol1_count = sum(1 for ch in chapters if ch["volume"] == "vol1")
        vol2_count = sum(1 for ch in chapters if ch["volume"] == "vol2")
        console.print(f"\n[dim]Found {len(chapters)} chapters (vol1: {vol1_count}, vol2: {vol2_count})[/dim]")
|
|
|
|
def validate_chapters(self, chapter_names: List[str]) -> List[Path]:
    """Validate a list of chapter names and return their paths.

    Args:
        chapter_names: List of chapter names to validate

    Returns:
        List of valid chapter file paths

    Raises:
        FileNotFoundError: If any chapter is not found
        AmbiguousChapterError: If chapter exists in multiple volumes
    """
    resolved: List[Path] = []

    for chapter_name in chapter_names:
        try:
            chapter_file = self.find_chapter_file(chapter_name, allow_fuzzy=True)
        except AmbiguousChapterError as e:
            # Show the candidate locations before re-raising so the user
            # can retry with an explicit volume prefix.
            console.print(f"[red]Ambiguous chapter: '{e.chapter_name}' exists in multiple volumes[/red]")
            console.print("[yellow]Please specify the volume:[/yellow]")
            for loc in e.locations:
                console.print(f" - {loc}")
            raise

        if chapter_file is None:
            # Help the user by listing a sample of valid names.
            available_chapters = [ch["name"] for ch in self.get_all_chapters()]
            console.print(f"[red]Chapter not found: {chapter_name}[/red]")
            console.print("[yellow]Available chapters:[/yellow]")
            for ch in available_chapters[:10]:  # Show first 10
                console.print(f" - {ch}")
            if len(available_chapters) > 10:
                console.print(f" ... and {len(available_chapters) - 10} more")
            raise FileNotFoundError(f"Chapter not found: {chapter_name}")

        resolved.append(chapter_file)

    return resolved
|
|
|
|
def expand_chapter_patterns(
|
|
self,
|
|
chapter_specs: List[str],
|
|
*,
|
|
volume: Optional[str] = None,
|
|
) -> List[str]:
|
|
"""
|
|
Expand glob/regex chapter patterns into concrete chapter specs.
|
|
|
|
Supported pattern forms:
|
|
- **Glob** (default): `appendix*`, `*principles`, `vol1/appendix_*`
|
|
- **Regex**: prefix with `re:` (matched with `re.search`), e.g. `re:^appendix_`
|
|
|
|
Notes:
|
|
- If a token has no wildcard/meta and doesn't start with `re:`, it is returned unchanged.
|
|
- If a pattern matches nothing, it is returned unchanged (so existing fuzzy matching
|
|
behavior remains available); callers may still fail later during validation.
|
|
- Order is preserved; duplicates are removed.
|
|
"""
|
|
# Candidate names come from discovery; includes front/backmatter too (useful for appendix*).
|
|
all_candidates = [ch["name"] for ch in self.get_all_chapters(volume=volume)]
|
|
|
|
expanded: List[str] = []
|
|
seen = set()
|
|
|
|
def _append(spec: str) -> None:
|
|
if spec not in seen:
|
|
expanded.append(spec)
|
|
seen.add(spec)
|
|
|
|
for spec in chapter_specs:
|
|
spec = spec.strip()
|
|
if not spec:
|
|
continue
|
|
|
|
spec_volume, name_or_pat = self._parse_chapter_spec(spec)
|
|
local_volume = spec_volume or volume
|
|
|
|
candidates = (
|
|
[ch["name"] for ch in self.get_all_chapters(volume=local_volume)]
|
|
if local_volume
|
|
else all_candidates
|
|
)
|
|
|
|
is_regex = name_or_pat.startswith("re:")
|
|
is_glob = any(ch in name_or_pat for ch in ["*", "?", "["])
|
|
|
|
matches: List[str] = []
|
|
if is_regex:
|
|
pat = name_or_pat[len("re:") :]
|
|
try:
|
|
rx = re.compile(pat)
|
|
matches = [c for c in candidates if rx.search(c)]
|
|
except re.error:
|
|
matches = []
|
|
elif is_glob:
|
|
matches = [c for c in candidates if fnmatch.fnmatchcase(c, name_or_pat)]
|
|
|
|
if matches:
|
|
for m in matches:
|
|
_append(f"{local_volume}/{m}" if spec_volume else m)
|
|
else:
|
|
# Not a pattern, or didn't match: keep original token for existing behavior.
|
|
_append(spec)
|
|
|
|
return expanded
|
|
|
|
def get_chapter_dependencies(self, chapter_file: Path) -> List[Path]:
|
|
"""Get dependencies for a chapter (images, includes, etc.).
|
|
|
|
Args:
|
|
chapter_file: Path to the chapter file
|
|
|
|
Returns:
|
|
List of dependency file paths
|
|
"""
|
|
dependencies = []
|
|
|
|
if not chapter_file.exists():
|
|
return dependencies
|
|
|
|
try:
|
|
content = chapter_file.read_text(encoding='utf-8')
|
|
|
|
# Find image references
|
|
image_pattern = r'!\[.*?\]\((.*?)\)'
|
|
for match in re.finditer(image_pattern, content):
|
|
image_path = match.group(1)
|
|
if not image_path.startswith('http'):
|
|
# Resolve relative to chapter file
|
|
full_path = (chapter_file.parent / image_path).resolve()
|
|
if full_path.exists():
|
|
dependencies.append(full_path)
|
|
|
|
# Find include references
|
|
include_pattern = r'{{< include (.*?) >}}'
|
|
for match in re.finditer(include_pattern, content):
|
|
include_path = match.group(1)
|
|
full_path = (chapter_file.parent / include_path).resolve()
|
|
if full_path.exists():
|
|
dependencies.append(full_path)
|
|
|
|
except Exception as e:
|
|
console.print(f"[yellow]⚠️ Error reading chapter dependencies: {e}[/yellow]")
|
|
|
|
return dependencies
|