Files
cs249r_book/book/cli/core/discovery.py
Vijay Janapa Reddi 69736d3bdb updates
2026-02-28 18:20:47 -05:00

582 lines
21 KiB
Python

"""
File and chapter discovery for MLSysBook CLI.
Handles finding chapter files, validating paths, and managing file operations.
Supports volume-aware discovery for vol1 and vol2.
Single source of truth for chapter ordering: `get_chapters_from_config()` reads
the PDF YAML config for a volume and returns the ordered list of testable chapter
stems. All commands (debug, build, validate, etc.) should call this method rather
than maintaining their own exclusion lists or filesystem scans.
"""
import re
import fnmatch
from pathlib import Path
from typing import List, Optional, Dict, Any
from rich.console import Console
console = Console()
# Volume directories
VOLUME_DIRS = ["vol1", "vol2"]
# Shared content directory (sibling to vol1/, vol2/ under contents/)
SHARED_DIR = "shared"
# Chapter stems that cannot be rendered standalone and are always excluded from
# per-chapter build/debug operations.
SKIP_STEMS = frozenset({"index", "references"})
def get_chapters_from_config(book_dir: Path, volume: str) -> List[str]:
"""Return the ordered list of buildable file stems from the PDF config.
Reads ``book/config/_quarto-pdf-{volume}.yml`` and extracts every entry
under ``book.chapters`` — including frontmatter, parts pages, and shared
files. Appendices are excluded. Only ``index.qmd`` and ``references.qmd``
are skipped, as they cannot be rendered standalone.
Args:
book_dir: Path to the ``book/quarto`` directory.
volume: ``"vol1"`` or ``"vol2"``.
Returns:
Ordered list of file stems in YAML order (e.g. ``["dedication",
"introduction", "distributed_training", ...]``). Empty list if the
config is missing or cannot be parsed.
"""
config_file = book_dir / "config" / f"_quarto-pdf-{volume}.yml"
if not config_file.exists():
return []
def _is_testable(path_str: str) -> bool:
return Path(path_str).stem not in SKIP_STEMS
# --- YAML-aware path (preferred) ---
try:
import yaml # type: ignore
raw = yaml.safe_load(config_file.read_text())
chapter_entries = raw.get("book", {}).get("chapters", [])
chapters: List[str] = []
seen: set = set()
for entry in chapter_entries:
if isinstance(entry, str):
path = entry
elif isinstance(entry, dict):
path = entry.get("file", "")
else:
continue
if not path or not _is_testable(path):
continue
stem = Path(path).stem
if stem and stem not in seen:
seen.add(stem)
chapters.append(stem)
return chapters
except Exception:
pass
# --- Regex fallback (no PyYAML) ---
content = config_file.read_text()
# Isolate the chapters: block (stop before appendices:)
chapters_block_match = re.search(
r'^\s{2}chapters:\s*\n(.*?)(?=^\s{2}\w|\Z)',
content,
re.MULTILINE | re.DOTALL,
)
block = chapters_block_match.group(1) if chapters_block_match else content
chapters = []
seen = set()
for m in re.finditer(r'\s*-\s*(contents/[^\s#]+\.qmd)', block):
path_str = m.group(1)
if not _is_testable(path_str):
continue
stem = Path(path_str).stem
if stem not in seen:
seen.add(stem)
chapters.append(stem)
return chapters
class AmbiguousChapterError(Exception):
"""Raised when a chapter name exists in multiple volumes."""
def __init__(self, chapter_name: str, locations: List[str]):
self.chapter_name = chapter_name
self.locations = locations
super().__init__(
f"'{chapter_name}' exists in multiple volumes: {', '.join(locations)}"
)
class ChapterDiscovery:
"""Discovers and manages chapter files in the MLSysBook project."""
def __init__(self, book_dir: Path):
"""Initialize chapter discovery.
Args:
book_dir: Path to the book directory (usually 'quarto')
"""
self.book_dir = Path(book_dir)
self.contents_dir = self.book_dir / "contents"
def get_chapters_from_config(self, volume: str) -> List[str]:
"""Return the ordered list of testable chapter stems for a volume.
Delegates to the module-level ``get_chapters_from_config`` function so
that all CLI commands share a single implementation. Call this instead
of ``get_volume_chapters`` whenever the canonical build order matters.
Args:
volume: ``"vol1"`` or ``"vol2"``.
Returns:
Ordered list of chapter stems from the PDF config (e.g.
``["introduction", "distributed_training", ...]``).
"""
return get_chapters_from_config(self.book_dir, volume)
def _get_volume_from_path(self, path: Path) -> Optional[str]:
"""Extract volume (vol1/vol2) from a file path.
Args:
path: Path to check
Returns:
'vol1', 'vol2', or None if not in a volume directory
"""
try:
rel_path = path.relative_to(self.contents_dir)
parts = rel_path.parts
if parts and parts[0] in VOLUME_DIRS:
return parts[0]
except ValueError:
pass
return None
def _parse_chapter_spec(self, chapter_spec: str) -> tuple[Optional[str], str]:
"""Parse a chapter specification that may include volume prefix.
Args:
chapter_spec: Chapter name, optionally with volume prefix (e.g., 'vol1/intro')
Returns:
Tuple of (volume, chapter_name) where volume may be None
"""
if "/" in chapter_spec:
parts = chapter_spec.split("/", 1)
if parts[0] in VOLUME_DIRS:
return parts[0], parts[1]
return None, chapter_spec
@staticmethod
def _match_score(query: str, candidate: str) -> int:
"""Score how well a query matches a candidate chapter name.
Higher score = better match. Uses longest common substring length
as primary metric, with shorter candidate names preferred as tiebreaker.
Args:
query: The search term (e.g., 'dnn_')
candidate: The chapter file stem (e.g., 'nn_architectures')
Returns:
Match score (higher is better), 0 if no match
"""
q = query.lower()
c = candidate.lower()
# Exact match gets highest score
if q == c:
return 10000
# Starts-with match gets high score, weighted by coverage
if c.startswith(q):
return 5000 + int(1000 * len(q) / len(c))
# Contains match gets medium score, weighted by coverage
if q in c:
return 2000 + int(1000 * len(q) / len(c))
# Partial overlap: find longest common substring
best = 0
for i in range(len(q)):
for j in range(i + 1, len(q) + 1):
sub = q[i:j]
if sub in c and len(sub) > best:
best = len(sub)
if best >= 2:
return 500 + int(1000 * best / len(c))
return 0
def find_chapter_file(self, chapter_spec: str, allow_fuzzy: bool = False) -> Optional[Path]:
"""Find a chapter file by name, using best-match scoring.
Supports volume-prefixed names (e.g., 'vol1/intro') for disambiguation.
Raises AmbiguousChapterError if chapter exists in multiple volumes
without a volume prefix.
Matching strategy (in order of priority):
1. Exact stem match (e.g., 'nn_computation' → nn_computation.qmd)
2. Best fuzzy match scored by: starts-with > contains > partial overlap,
with higher coverage (query length / candidate length) preferred.
Args:
chapter_spec: Chapter name to search for, optionally with volume prefix
allow_fuzzy: If True, allow fuzzy fallback for non-exact matches.
Returns:
Path to the chapter file if found, None otherwise
Raises:
AmbiguousChapterError: If chapter exists in multiple volumes without prefix
"""
if not self.contents_dir.exists():
console.print(f"[red]Contents directory not found: {self.contents_dir}[/red]")
return None
# Parse volume prefix if present
volume_filter, chapter_name = self._parse_chapter_spec(chapter_spec)
# Determine search directory
if volume_filter:
search_dir = self.contents_dir / volume_filter
if not search_dir.exists():
console.print(f"[red]Volume directory not found: {search_dir}[/red]")
return None
else:
search_dir = self.contents_dir
# Try exact match first
exact_matches = list(search_dir.rglob(f"{chapter_name}.qmd"))
# When a volume prefix was given, also search the shared directory so that
# files like contents/shared/notation.qmd are resolvable as "vol1/notation".
if volume_filter:
shared_dir = self.contents_dir / SHARED_DIR
if shared_dir.exists():
exact_matches += list(shared_dir.rglob(f"{chapter_name}.qmd"))
# Filter to actual chapter files (in volume directories, not frontmatter/backmatter)
chapter_matches = []
for match in exact_matches:
vol = self._get_volume_from_path(match)
is_shared = SHARED_DIR in match.relative_to(self.contents_dir).parts
if vol or volume_filter or is_shared:
chapter_matches.append(match)
if not chapter_matches and allow_fuzzy:
# No exact match — score all .qmd files and pick the best
all_qmd_files = list(search_dir.rglob("*.qmd"))
scored = []
for match in all_qmd_files:
vol = self._get_volume_from_path(match)
if not (vol or volume_filter):
continue
score = self._match_score(chapter_name, match.stem)
if score > 0:
scored.append((score, match))
if scored:
# Sort by score descending
scored.sort(key=lambda x: x[0], reverse=True)
best_score = scored[0][0]
# Reject weak fuzzy matches to avoid incorrect chapter resolution.
if best_score < 2000:
return None
# Collect all matches with the same best score
chapter_matches = [m for s, m in scored if s == best_score]
if not chapter_matches:
return None
if len(chapter_matches) == 1:
return chapter_matches[0]
# Multiple matches - check if they're in different volumes (ambiguous)
if not volume_filter:
volumes_found = {}
for match in chapter_matches:
vol = self._get_volume_from_path(match)
if vol:
if vol not in volumes_found:
volumes_found[vol] = match
if len(volumes_found) > 1:
# Ambiguous only when the matched chapter stem is actually the same
# in multiple volumes (e.g., vol1/introduction and vol2/introduction).
stems = {m.stem for m in volumes_found.values()}
if len(stems) == 1:
stem = next(iter(stems))
locations = [f"{vol}/{stem}" for vol in sorted(volumes_found.keys())]
raise AmbiguousChapterError(stem, locations)
# Tied fuzzy matches with different stems are not reliable.
return None
# Return the first match
return chapter_matches[0]
def get_all_chapters(self, volume: Optional[str] = None) -> List[Dict[str, Any]]:
"""Get all chapter files with metadata.
Args:
volume: Optional volume filter ('vol1', 'vol2', or None for all)
Returns:
List of dictionaries containing chapter information
"""
chapters = []
if not self.contents_dir.exists():
return chapters
# Determine search directory
if volume:
if volume not in VOLUME_DIRS:
console.print(f"[red]Invalid volume: {volume}. Use 'vol1' or 'vol2'[/red]")
return chapters
search_dir = self.contents_dir / volume
else:
search_dir = self.contents_dir
for qmd_file in search_dir.rglob("*.qmd"):
# Skip certain files
if qmd_file.name in ["index.qmd", "404.qmd"]:
continue
# Get relative path from contents directory
rel_path = qmd_file.relative_to(self.contents_dir)
# Determine volume
vol = self._get_volume_from_path(qmd_file)
# Skip non-volume files (frontmatter, backmatter) unless searching all
if not volume and not vol:
continue
# Extract chapter info
chapter_info = {
"name": qmd_file.stem,
"path": qmd_file,
"relative_path": rel_path,
"directory": qmd_file.parent.name,
"volume": vol,
"size": qmd_file.stat().st_size if qmd_file.exists() else 0
}
chapters.append(chapter_info)
# Sort by path for consistent ordering
chapters.sort(key=lambda x: str(x["relative_path"]))
return chapters
def get_volume_chapters(self, volume: str) -> List[Path]:
"""Get all chapter file paths for a specific volume.
Args:
volume: Volume to get chapters for ('vol1' or 'vol2')
Returns:
List of chapter file paths
"""
chapters = self.get_all_chapters(volume=volume)
return [ch["path"] for ch in chapters]
def show_chapters(self, volume: Optional[str] = None) -> None:
"""Display available chapters in a formatted table.
Args:
volume: Optional volume filter ('vol1', 'vol2', or None for all)
"""
from rich.table import Table
chapters = self.get_all_chapters(volume=volume)
if not chapters:
console.print("[yellow]No chapters found[/yellow]")
return
table = Table(show_header=True, header_style="bold blue")
table.add_column("Chapter", style="green", width=25)
table.add_column("Volume", style="magenta", width=8)
table.add_column("Directory", style="cyan", width=20)
table.add_column("Size", style="dim", width=10)
for chapter in chapters:
size_kb = chapter["size"] / 1024 if chapter["size"] > 0 else 0
size_str = f"{size_kb:.1f} KB" if size_kb > 0 else "0 KB"
table.add_row(
chapter["name"],
chapter["volume"] or "-",
chapter["directory"],
size_str
)
console.print(table)
# Show volume summary
vol1_count = sum(1 for ch in chapters if ch["volume"] == "vol1")
vol2_count = sum(1 for ch in chapters if ch["volume"] == "vol2")
if volume:
console.print(f"\n[dim]Found {len(chapters)} chapters in {volume}[/dim]")
else:
console.print(f"\n[dim]Found {len(chapters)} chapters (vol1: {vol1_count}, vol2: {vol2_count})[/dim]")
def validate_chapters(self, chapter_names: List[str]) -> List[Path]:
"""Validate a list of chapter names and return their paths.
Args:
chapter_names: List of chapter names to validate
Returns:
List of valid chapter file paths
Raises:
FileNotFoundError: If any chapter is not found
AmbiguousChapterError: If chapter exists in multiple volumes
"""
chapter_files = []
for chapter_name in chapter_names:
try:
chapter_file = self.find_chapter_file(chapter_name, allow_fuzzy=True)
except AmbiguousChapterError as e:
console.print(f"[red]Ambiguous chapter: '{e.chapter_name}' exists in multiple volumes[/red]")
console.print("[yellow]Please specify the volume:[/yellow]")
for loc in e.locations:
console.print(f" - {loc}")
raise
if not chapter_file:
available_chapters = [ch["name"] for ch in self.get_all_chapters()]
console.print(f"[red]Chapter not found: {chapter_name}[/red]")
console.print("[yellow]Available chapters:[/yellow]")
for ch in available_chapters[:10]: # Show first 10
console.print(f" - {ch}")
if len(available_chapters) > 10:
console.print(f" ... and {len(available_chapters) - 10} more")
raise FileNotFoundError(f"Chapter not found: {chapter_name}")
chapter_files.append(chapter_file)
return chapter_files
def expand_chapter_patterns(
self,
chapter_specs: List[str],
*,
volume: Optional[str] = None,
) -> List[str]:
"""
Expand glob/regex chapter patterns into concrete chapter specs.
Supported pattern forms:
- **Glob** (default): `appendix*`, `*principles`, `vol1/appendix_*`
- **Regex**: prefix with `re:` (matched with `re.search`), e.g. `re:^appendix_`
Notes:
- If a token has no wildcard/meta and doesn't start with `re:`, it is returned unchanged.
- If a pattern matches nothing, it is returned unchanged (so existing fuzzy matching
behavior remains available); callers may still fail later during validation.
- Order is preserved; duplicates are removed.
"""
# Candidate names come from discovery; includes front/backmatter too (useful for appendix*).
all_candidates = [ch["name"] for ch in self.get_all_chapters(volume=volume)]
expanded: List[str] = []
seen = set()
def _append(spec: str) -> None:
if spec not in seen:
expanded.append(spec)
seen.add(spec)
for spec in chapter_specs:
spec = spec.strip()
if not spec:
continue
spec_volume, name_or_pat = self._parse_chapter_spec(spec)
local_volume = spec_volume or volume
candidates = (
[ch["name"] for ch in self.get_all_chapters(volume=local_volume)]
if local_volume
else all_candidates
)
is_regex = name_or_pat.startswith("re:")
is_glob = any(ch in name_or_pat for ch in ["*", "?", "["])
matches: List[str] = []
if is_regex:
pat = name_or_pat[len("re:") :]
try:
rx = re.compile(pat)
matches = [c for c in candidates if rx.search(c)]
except re.error:
matches = []
elif is_glob:
matches = [c for c in candidates if fnmatch.fnmatchcase(c, name_or_pat)]
if matches:
for m in matches:
_append(f"{local_volume}/{m}" if spec_volume else m)
else:
# Not a pattern, or didn't match: keep original token for existing behavior.
_append(spec)
return expanded
def get_chapter_dependencies(self, chapter_file: Path) -> List[Path]:
"""Get dependencies for a chapter (images, includes, etc.).
Args:
chapter_file: Path to the chapter file
Returns:
List of dependency file paths
"""
dependencies = []
if not chapter_file.exists():
return dependencies
try:
content = chapter_file.read_text(encoding='utf-8')
# Find image references
image_pattern = r'!\[.*?\]\((.*?)\)'
for match in re.finditer(image_pattern, content):
image_path = match.group(1)
if not image_path.startswith('http'):
# Resolve relative to chapter file
full_path = (chapter_file.parent / image_path).resolve()
if full_path.exists():
dependencies.append(full_path)
# Find include references
include_pattern = r'{{< include (.*?) >}}'
for match in re.finditer(include_pattern, content):
include_path = match.group(1)
full_path = (chapter_file.parent / include_path).resolve()
if full_path.exists():
dependencies.append(full_path)
except Exception as e:
console.print(f"[yellow]⚠️ Error reading chapter dependencies: {e}[/yellow]")
return dependencies