Files
cs249r_book/book/tools/scripts/_archive/obsolete/collect_pdfs.py
Vijay Janapa Reddi e3cc9f7af3 refactor: rename ml_ml_workflow files, consolidate CLI, and clean up scripts
Remove redundant ml_ prefix from ml_workflow chapter files and update all
Quarto config references. Consolidate custom scripts into native binder
subcommands and archive obsolete tooling.
2026-02-13 11:06:28 -05:00

187 lines
5.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Collect and organize chapter PDFs from build artifacts.
This script:
1. Reads the chapter order from _quarto.yml
2. Finds built PDFs in the artifacts directories
3. Copies them to a single directory with numbered prefixes matching the book order
4. Optionally keeps the detailed folder structure (--keep-structure; not implemented yet)
"""
import os
import shutil
import yaml
from pathlib import Path
import argparse
def extract_chapter_slug(qmd_path):
    """Derive the chapter slug for a ``.qmd`` path listed in ``_quarto.yml``.

    Returns ``None`` for entries that should be skipped (part dividers and
    the index page); otherwise returns the slug string.

    Examples:
        - 'contents/vol1/introduction/introduction.qmd' -> 'introduction'
        - 'contents/vol1/frontmatter/foreword.qmd' -> 'foreword'
        - 'contents/vol1/backmatter/references.qmd' -> 'references'
        - 'contents/vol1/optimizations/model_compression.qmd' -> 'model_compression'
    """
    qmd = Path(qmd_path)
    stem = qmd.stem

    # Part divider pages (noted upstream as having no PDF) and the index
    # page are never collected.
    skipped_stems = (
        'foundations_principles',
        'build_principles',
        'optimize_principles',
        'deploy_principles',
        'index',
    )
    if stem in skipped_stems:
        return None

    # Anything living directly in a 'glossary' directory maps to one slug.
    if qmd.parent.name == 'glossary':
        return 'glossary'

    # Every remaining case resolves to the file stem: frontmatter/backmatter
    # files, chapters whose directory and file share a name, and chapters
    # whose file name differs from its directory.
    return stem
def _iter_chapter_paths(entries):
    """Yield, in order, every file path string found in a chapter list."""
    if not isinstance(entries, (list, tuple)):
        return
    for entry in entries:
        if isinstance(entry, str):
            yield entry
        elif isinstance(entry, dict):
            # Dict entries cannot be passed to Path(); pull out an explicit
            # file reference if one is given, then descend into any nested
            # 'chapters' list. The 'part' value itself is deliberately not
            # yielded -- NOTE(review): assumes a part is a title or divider
            # page rather than a chapter with its own PDF; confirm against
            # the project's _quarto.yml.
            target = entry.get('file') or entry.get('href')
            if isinstance(target, str):
                yield target
            yield from _iter_chapter_paths(entry.get('chapters'))


def read_chapter_order(quarto_yml_path):
    """Read the chapter order from _quarto.yml.

    Args:
        quarto_yml_path: Path to the Quarto project configuration file.

    Returns:
        A list of chapter slugs in the order they appear under
        ``book.chapters`` followed by ``book.appendices``. Entries for which
        ``extract_chapter_slug`` returns a falsy value are left out. An empty
        list is returned when the file has no ``book`` mapping.

    Raises:
        OSError: If the configuration file cannot be opened.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(quarto_yml_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # safe_load returns None for an empty document; guard before indexing.
    book = config.get('book') if isinstance(config, dict) else None
    if not isinstance(book, dict):
        return []

    chapters = []
    for section in ('chapters', 'appendices'):
        for chapter_path in _iter_chapter_paths(book.get(section)):
            slug = extract_chapter_slug(chapter_path)
            if slug:
                chapters.append(slug)
    return chapters
def find_pdf(chapter_slug, logs_dir, vol):
    """Find the PDF for a given chapter.

    Looks for ``*.pdf`` files directly inside
    ``<logs_dir>/<vol>/<chapter_slug>/artifacts``.

    Args:
        chapter_slug: Chapter directory name under the volume's log folder.
        logs_dir: Root logs directory (``Path`` or string).
        vol: Volume directory name, e.g. ``'vol1'``.

    Returns:
        The ``Path`` of the PDF, or ``None`` when the artifacts directory is
        missing or contains no PDF. If several PDFs are present, the one
        that sorts first by path is returned.
    """
    artifacts_dir = Path(logs_dir) / vol / chapter_slug / 'artifacts'
    if not artifacts_dir.is_dir():
        return None

    # glob() yields matches in an arbitrary, filesystem-dependent order.
    # Only one PDF is expected, but sort so the choice is reproducible if
    # more than one is ever present.
    pdf_files = sorted(artifacts_dir.glob('*.pdf'))
    return pdf_files[0] if pdf_files else None
def _locate_book_dir(script_path):
    """Return the nearest ancestor of *script_path* holding quarto/_quarto.yml."""
    script_path = Path(script_path).resolve()
    for ancestor in script_path.parents:
        if (ancestor / 'quarto' / '_quarto.yml').is_file():
            return ancestor
    # Nothing found: keep the historical fixed-depth guess so the later
    # "file not found" error points at the same location as before.
    return script_path.parent.parent.parent.parent


def collect_pdfs(vol='vol1', output_dir=None, keep_structure=False):
    """Collect and organize PDFs from build artifacts.

    Reads the chapter order from ``<book>/quarto/_quarto.yml``, looks up each
    chapter's PDF under ``<book>/tools/scripts/testing/logs/<vol>/`` and
    copies it into the output directory as ``NN_<slug>.pdf``, where ``NN`` is
    the chapter's 1-based position in the book order.

    Args:
        vol: Volume directory name under the logs folder.
        output_dir: Destination directory. Defaults to
            ``<logs>/<vol>_collected_pdfs``. Created if it does not exist.
        keep_structure: Accepted for interface compatibility; the feature is
            not implemented and a note is printed when it is requested.

    Returns:
        A ``(collected, missing)`` tuple. ``collected`` is a list of
        ``(chapter_slug, output_path)`` pairs for the PDFs that were copied;
        ``missing`` is a list of slugs for which no PDF was found.
    """
    # Paths
    # A fixed four-level parent walk only lands on the book directory when
    # this script sits exactly three directories below it. Search upward
    # instead so the script also works from other depths (for example an
    # archive subfolder).
    book_dir = _locate_book_dir(__file__)
    quarto_yml = book_dir / 'quarto' / '_quarto.yml'
    logs_dir = book_dir / 'tools' / 'scripts' / 'testing' / 'logs'
    if output_dir is None:
        output_dir = logs_dir / f'{vol}_collected_pdfs'
    else:
        output_dir = Path(output_dir)

    if keep_structure:
        # Do not ignore the flag silently: tell the user what will happen.
        print("Note: keep_structure is not implemented yet; "
              "PDFs are collected into a single flat directory.")

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read chapter order
    print(f"Reading chapter order from {quarto_yml}...")
    chapters = read_chapter_order(quarto_yml)
    print(f"\nFound {len(chapters)} chapters in order:")
    for i, chapter in enumerate(chapters, 1):
        print(f" {i:02d}. {chapter}")

    # Collect PDFs
    print(f"\nCollecting PDFs to {output_dir}...")
    collected = []
    missing = []
    for i, chapter_slug in enumerate(chapters, 1):
        pdf_path = find_pdf(chapter_slug, logs_dir, vol)
        if pdf_path:
            # The numeric prefix keeps the book order even when some
            # chapters are missing (numbers are not renumbered).
            new_filename = f"{i:02d}_{chapter_slug}.pdf"
            output_path = output_dir / new_filename
            # copy2 also preserves file metadata such as timestamps.
            shutil.copy2(pdf_path, output_path)
            collected.append((chapter_slug, output_path))
            print(f"{i:02d}. {chapter_slug} -> {new_filename}")
        else:
            missing.append(chapter_slug)
            print(f"{i:02d}. {chapter_slug} (PDF not found)")

    # Summary
    rule = '=' * 70
    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)
    print(f" Total chapters: {len(chapters)}")
    print(f" Collected: {len(collected)}")
    print(f" Missing: {len(missing)}")
    print(f"\n Output directory: {output_dir}")
    if missing:
        print("\n Missing PDFs for:")
        for chapter in missing:
            print(f" - {chapter}")
    print(f"{rule}\n")
    return collected, missing
if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them straight to
    # collect_pdfs().
    cli = argparse.ArgumentParser(description='Collect and organize chapter PDFs from build artifacts')
    cli.add_argument('--vol', default='vol1', help='Volume to collect (default: vol1)')
    cli.add_argument('--output', '-o', help='Output directory (default: logs/<vol>_collected_pdfs)')
    cli.add_argument('--keep-structure', action='store_true',
                     help='Keep detailed folder structure (not implemented yet)')
    options = cli.parse_args()
    collect_pdfs(
        vol=options.vol,
        output_dir=options.output,
        keep_structure=options.keep_structure,
    )