Files
cs249r_book/book/tools/scripts/images/manage_external_images.py
Vijay Janapa Reddi 9781727d60 refactor: rename advanced_intro to introduction and update scripts
- Renamed vol2/advanced_intro to vol2/introduction for consistency
- Updated all scripts and configs to use vol1/ instead of core/
- Updated pre-commit config to check all contents/ not just vol1/
- Updated path references in Lua filters, Python scripts, and configs
2026-01-01 14:46:52 -05:00

561 lines
22 KiB
Python
Executable File
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
External Image Downloader for Quarto Markdown Files
This script automatically downloads external images referenced in markdown files
and organizes them locally according to the project's directory structure.
DESCRIPTION:
Processes .qmd files to find markdown images (with or without #fig references)
that use external URLs (http/https). Downloads these images and organizes them
by file type in subdirectories, then updates the markdown to reference local paths.
FEATURES:
- Smart pattern recognition for markdown figures
- Automatic file type detection and organization
- Unique filename generation to prevent conflicts
- Preserves original figure captions and IDs
- Safe dry-run mode for previewing changes
- Handles nested brackets/parentheses in captions
- Error handling and progress logging
DIRECTORY STRUCTURE:
Images are organized as: chapter/images/{file_type}/filename
- chapter/images/png/ for PNG files
- chapter/images/jpeg/ for JPEG files
- chapter/images/pdf/ for PDF files
- etc.
EXAMPLE TRANSFORMATION:
Before: ![caption](https://example.com/image.png){#fig-id}
After: ![caption](images/png/image.png){#fig-id}
USAGE:
# Process all files recursively from current directory
python3 manage_external_images.py -d .
# Process all files in specific directory
python3 manage_external_images.py -d book/contents/vol1
# Process single file
python3 manage_external_images.py -f path/to/file.qmd
# Preview changes without downloading
python3 manage_external_images.py -d . --dry-run
REQUIREMENTS:
- Python 3.6+
- requests library
- Internet connection for downloads
"""
import os
import re
import requests
import hashlib
from pathlib import Path
from urllib.parse import urlparse
import argparse
from typing import List, Tuple, Optional
import logging
# Console logging is limited to warnings and errors; info/debug messages
# emitted elsewhere in this module stay hidden unless the level is lowered.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.WARNING,
)
logger = logging.getLogger(__name__)
class ImageDownloader:
    """
    Download external images referenced by Quarto markdown files and localize them.

    The class covers the whole workflow: finding .qmd files, extracting image
    references that point at http(s) URLs, downloading them into
    ``<qmd dir>/images/<extension>/`` and rewriting the markdown so that it
    references the local copy.

    Attributes:
        base_dir (Path): Base directory containing .qmd files to process
        session (requests.Session): HTTP session with browser-like headers
    """

    def __init__(self, base_dir: str):
        """
        Initialize the ImageDownloader.

        Args:
            base_dir (str): Base directory to search for .qmd files
        """
        self.base_dir = Path(base_dir)
        self.session = requests.Session()
        # Use browser-like headers to avoid being blocked by some servers
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def find_qmd_files(self) -> List[Path]:
        """
        Find all .qmd files recursively in the base directory.

        Prints the number of files found. A missing base directory is logged
        as a warning and yields an empty list.

        Returns:
            List[Path]: Paths to the .qmd files found, in sorted order
        """
        qmd_files: List[Path] = []
        if not self.base_dir.exists():
            logger.warning(f"❌ Base directory does not exist: {self.base_dir}")
            return qmd_files

        # Sorting makes the processing order (and therefore the log output)
        # reproducible across filesystems.
        for qmd_file in sorted(self.base_dir.rglob("*.qmd")):
            if qmd_file.is_file():
                qmd_files.append(qmd_file)
                logger.debug(f"Found .qmd file: {qmd_file}")

        print(f"📄 Found {len(qmd_files)} .qmd files")
        return qmd_files

    def extract_figure_images(self, content: str) -> List[Tuple[str, str, str, str, str]]:
        r"""
        Extract markdown images (with or without #fig references) that use external URLs.

        Strategy: for every ``![`` on a line, look at the text up to the first
        ``}`` (or the next ``![`` / end of line), collect ALL ``](url)``
        patterns in it and take the LAST one as the image URL; earlier ones
        are treated as links inside the caption (e.g. citations).

        Only images whose URL starts with http:// or https:// are returned.
        Images without a ``#fig-...`` identifier get a synthetic
        ``fig-auto-<md5 prefix>`` id so callers always have a label.

        Args:
            content (str): Full text of a markdown file

        Returns:
            List of tuples ``(full_match, caption, url, fig_id, attributes)``:
            ``full_match`` is the exact source text of the image (including an
            attribute block only when one directly follows the URL),
            ``caption`` is everything between ``![`` and the image's ``](``,
            and ``attributes`` is the content of the ``{...}`` block with the
            braces removed (it still contains ``#fig-id`` when present).
        """
        matches = []
        for line in content.split('\n'):
            if '![' not in line:
                continue

            idx = 0
            while idx < len(line):
                start = line.find('![', idx)
                if start == -1:
                    break

                # Candidate span ends at the first '}' or at the next image,
                # whichever comes first; otherwise at the end of the line.
                end_brace = line.find('}', start)
                next_img = line.find('![', start + 2)
                if end_brace != -1 and (next_img == -1 or end_brace < next_img):
                    end = end_brace + 1
                elif next_img != -1:
                    end = next_img
                else:
                    end = len(line)
                idx = end  # always advances, so every `continue` below is safe

                candidate = line[start:end]
                url_patterns = list(re.finditer(r'\]\(([^)]+)\)', candidate))
                if not url_patterns:
                    continue

                # The LAST ](url) is the image target.
                image_link = url_patterns[-1]
                url = image_link.group(1).strip()
                if not url.lower().startswith(('http://', 'https://')):
                    continue

                # Slice the caption instead of matching up to the first ']':
                # captions may contain nested links such as [cite](url).
                caption = candidate[2:image_link.start()]

                # An attribute block only belongs to the image when it directly
                # follows the closing ')'. Anything else is surrounding prose
                # and must stay out of full_match so a later replace() cannot
                # delete it.
                fig_id = None
                attributes = ""
                attrs_match = re.match(r'\{([^}]+)\}', candidate[image_link.end():])
                if attrs_match:
                    attrs_block = attrs_match.group(1)
                    fig_match = re.search(r'#(fig-[^\s}]+)', attrs_block)
                    if fig_match:
                        fig_id = fig_match.group(1)
                    attributes = attrs_block.strip()
                    full_match = candidate[:image_link.end() + attrs_match.end()]
                else:
                    full_match = candidate[:image_link.end()]

                if not fig_id:
                    url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
                    fig_id = f"fig-auto-{url_hash}"

                matches.append((full_match, caption, url, fig_id, attributes))

        return matches

    def get_file_extension(self, url: str, response_headers: dict) -> str:
        """
        Determine the file extension from the URL path or the Content-Type header.

        Args:
            url (str): Image URL
            response_headers (dict): Response headers; only 'content-type' is read

        Returns:
            str: Extension without the leading dot. Falls back to 'png' (with a
            logged warning) when neither source identifies the type.
        """
        # First try to get extension from URL
        path = urlparse(url).path.lower()
        if path.endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.pdf', '.webp')):
            return path.split('.')[-1]

        # Try to get from content type
        content_type = response_headers.get('content-type', '').lower()
        content_type_map = {
            'image/png': 'png',
            'image/jpeg': 'jpg',
            'image/jpg': 'jpg',
            'image/gif': 'gif',
            'image/svg+xml': 'svg',
            'application/pdf': 'pdf',
            'image/webp': 'webp'
        }
        for ct, ext in content_type_map.items():
            # Substring test tolerates parameters such as "; charset=utf-8"
            if ct in content_type:
                return ext

        # Default to png if we can't determine
        logger.warning(f"⚠️ Could not determine file type for {url}, defaulting to png")
        return 'png'

    @staticmethod
    def _sanitize_name(name: str) -> str:
        """Reduce *name* to alphanumerics, hyphens and underscores (may return '')."""
        name = re.sub(r'[^a-zA-Z0-9_-]', '-', name)
        name = re.sub(r'-+', '-', name)
        return name.strip('-')

    def generate_filename(self, url: str, fig_id: str, extension: str) -> str:
        """
        Generate a local filename based on the original URL filename.

        The URL's basename is used without its extension and without a trailing
        ``_<8+ alphanumerics>`` suffix (common in CDN URLs). If that leaves
        fewer than three characters, or nothing survives sanitizing, the name
        falls back to the figure id and finally to a hash of the URL, so the
        result never has an empty stem.

        NOTE: no uniqueness check is performed; two different URLs with the
        same basename map to the same filename.

        Args:
            url (str): Original URL of the image
            fig_id (str): Figure identifier from markdown (e.g., 'fig-example')
            extension (str): File extension (e.g., 'png', 'jpg')

        Returns:
            str: Generated filename (e.g., 'oranges-frogs.png')

        Example:
            URL: https://example.com/path/oranges-frogs_abc12345.png?params
            Result: oranges-frogs.png
        """
        original_filename = os.path.basename(urlparse(url).path)
        # Remove any existing extension
        base_name = original_filename.rsplit('.', 1)[0] if '.' in original_filename else original_filename
        # e.g., "oranges-frogs_nHEaTqne53" -> "oranges-frogs"
        base_name = re.sub(r'_[a-zA-Z0-9]{8,}$', '', base_name)

        fig_based = fig_id.replace('fig-', '').replace('auto-', '')
        if not base_name or len(base_name) < 3:
            base_name = fig_based

        base_name = self._sanitize_name(base_name)
        if not base_name:
            # Sanitizing can erase a name made only of unsupported characters.
            base_name = self._sanitize_name(fig_based)
        if not base_name:
            # Last resort: use URL hash
            base_name = hashlib.md5(url.encode()).hexdigest()[:12]

        return f"{base_name}.{extension}"

    def download_image(self, url: str, output_path: Path) -> bool:
        """
        Download *url* to *output_path*.

        The data is streamed into a sibling ``<name>.part`` file and renamed on
        success, so an interrupted download never leaves a truncated file at
        *output_path* (which a later run would mistake for a finished one).

        Returns:
            bool: True on success, False on any failure (the error is logged)
        """
        partial_path = output_path.with_name(output_path.name + ".part")
        try:
            logger.info(f"📦 Downloading {url}")
            # The context manager releases the streamed connection.
            with self.session.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()
                # Create directory if it doesn't exist
                output_path.parent.mkdir(parents=True, exist_ok=True)
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(partial_path, output_path)
            logger.info(f"✅ Successfully downloaded to {output_path}")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to download {url}: {e}")
            try:
                if partial_path.exists():
                    partial_path.unlink()
            except OSError:
                pass  # best-effort cleanup; the failure itself is already logged
            return False

    @staticmethod
    def _build_replacement(caption: str, local_path: str, fig_id: str, attributes: str) -> str:
        """
        Build the markdown that replaces an external image reference.

        ``attributes`` is the original ``{...}`` content and already carries
        the ``#fig-id`` when the image had one, so it is emitted verbatim.
        Auto-generated ids (``fig-auto-...``) are never written to the file.
        """
        if attributes:
            return f"![{caption}]({local_path}){{{attributes}}}"
        if not fig_id.startswith("fig-auto-"):
            return f"![{caption}]({local_path}){{#{fig_id}}}"
        return f"![{caption}]({local_path})"

    def process_file(self, qmd_file: Path, dry_run: bool = False, confirm: bool = False) -> int:
        """
        Process a single .qmd file and download its external images.

        Args:
            qmd_file (Path): Markdown file to process
            dry_run (bool): Only report what would be downloaded
            confirm (bool): Ask on stdin before each download

        Returns:
            int: Number of images actually downloaded. Read or write failures
            are logged rather than raised.
        """
        logger.info(f"📄 Processing {qmd_file}")
        try:
            with open(qmd_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            logger.error(f"❌ Failed to read {qmd_file}: {e}")
            return 0

        figure_images = self.extract_figure_images(content)
        if not figure_images:
            logger.info(f"📄 No external figure images found in {qmd_file}")
            return 0
        logger.info(f"🔍 Found {len(figure_images)} external figure images")

        downloaded_count = 0
        updated_count = 0
        new_content = content
        for full_match, caption, url, fig_id, attributes in figure_images:
            logger.info(f"🔍 Processing {fig_id}: {url}")

            if dry_run:
                # Printed, not logged: the module's WARNING log level would
                # otherwise hide the whole point of a dry run.
                print(f"🧪 [DRY RUN] Would download {url} for {fig_id}")
                continue

            # Confirmation mode
            if confirm:
                print(f"\n🤔 Download external image?")
                print(f" 📄 File: {qmd_file}")
                print(f" 🔍 Figure: {fig_id}")
                print(f" 🌐 URL: {url}")
                response = input(" Download? [y/N]: ").lower().strip()
                if response not in ['y', 'yes']:
                    logger.info(f"⏭️ Skipped {fig_id}")
                    continue

            # Get file extension by making a HEAD request. HEAD does not follow
            # redirects by default, so ask for it explicitly to see the final
            # Content-Type.
            try:
                head_response = self.session.head(url, timeout=10, allow_redirects=True)
                extension = self.get_file_extension(url, head_response.headers)
            except Exception as e:
                logger.warning(f"⚠️ HEAD request failed for {url} ({e}), defaulting to png")
                extension = 'png'  # fallback

            # Images live next to the .qmd file, grouped by extension
            images_dir = qmd_file.parent / "images" / extension
            filename = self.generate_filename(url, fig_id, extension)
            output_path = images_dir / filename
            local_path = f"images/{extension}/{filename}"
            replacement = self._build_replacement(caption, local_path, fig_id, attributes)

            # Check if file already exists
            if output_path.exists():
                logger.info(f"📁 File already exists: {output_path}")
                # Update the markdown anyway in case the reference is wrong
                new_content = new_content.replace(full_match, replacement)
                updated_count += 1
                continue

            # Download the image
            if self.download_image(url, output_path):
                # Update the markdown content to use local path
                new_content = new_content.replace(full_match, replacement)
                updated_count += 1
                downloaded_count += 1
            else:
                logger.warning(f"⚠️ Skipping update for failed download: {fig_id}")

        # Write updated content back to file if we made changes
        if new_content != content:
            try:
                with open(qmd_file, 'w', encoding='utf-8') as f:
                    f.write(new_content)
                logger.info(f"✅ Updated {qmd_file} with {updated_count} local image references ({downloaded_count} downloaded)")
            except Exception as e:
                logger.error(f"❌ Failed to write updated content to {qmd_file}: {e}")

        return downloaded_count

    def process_all_files(self, dry_run: bool = False, confirm: bool = False) -> Tuple[int, int]:
        """
        Process all .qmd files in the base directory.

        A failure in one file is logged and does not stop the remaining files.

        Returns:
            Tuple of (files_processed, images_downloaded)
        """
        qmd_files = self.find_qmd_files()
        print(f"🔍 Found {len(qmd_files)} .qmd files to process")

        total_downloaded = 0
        files_processed = 0
        for qmd_file in qmd_files:
            try:
                total_downloaded += self.process_file(qmd_file, dry_run, confirm)
                files_processed += 1
            except Exception as e:
                logger.error(f"❌ Error processing {qmd_file}: {e}")

        return files_processed, total_downloaded

    def validate_external_images(self, ignore_external: bool = False) -> Tuple[int, List[Tuple[Path, str, str]]]:
        """
        Validate mode: Find external images without downloading.

        Args:
            ignore_external (bool): If True, external images are logged as
                warnings instead of errors

        Returns:
            Tuple of (number_of_qmd_files_found, list_of_external_images)
            Each external image is (file_path, fig_id, url)
        """
        qmd_files = self.find_qmd_files()
        print(f"🔍 Validating {len(qmd_files)} .qmd files for external images")

        all_external_images = []
        for qmd_file in qmd_files:
            try:
                with open(qmd_file, 'r', encoding='utf-8') as f:
                    content = f.read()
                for _full_match, _caption, url, fig_id, _attributes in self.extract_figure_images(content):
                    all_external_images.append((qmd_file, fig_id, url))
                    if ignore_external:
                        logger.warning(f"⚠️ External image in {qmd_file}: {fig_id}: {url}")
                    else:
                        logger.error(f"❌ External image found in {qmd_file}: {fig_id}: {url}")
            except Exception as e:
                logger.error(f"❌ Error reading {qmd_file}: {e}")

        return len(qmd_files), all_external_images
def main():
    """
    Command-line entry point.

    Exactly one mode must be chosen: -f/--file, -d/--directory or --validate.
    Returns the process exit code: 1 when strict validation finds external
    images or when the file given with -f does not exist, otherwise 0.
    """
    cli = argparse.ArgumentParser(
        description="Download external images from Quarto markdown files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s -d . # Process all files recursively from current directory
%(prog)s -d book/contents/vol1 # Process all files in specific directory
%(prog)s -f chapter/file.qmd # Process single file
%(prog)s -d . --dry-run # Preview what would be downloaded
%(prog)s -d . --confirm # Ask for confirmation before each download
%(prog)s --validate book/contents/vol1 # Validate mode: fail if external images found (pre-commit)
%(prog)s --validate . --ignore-external # Validate mode: warn only (allow external images)
""")

    # The three modes exclude each other and one of them is mandatory
    modes = cli.add_mutually_exclusive_group(required=True)
    modes.add_argument("-f", "--file", type=str, metavar="FILE",
                       help="Process only a specific file")
    modes.add_argument("-d", "--directory", type=str, metavar="DIR",
                       help="Process all .qmd files recursively in specified directory")
    modes.add_argument("--validate", type=str, metavar="DIR",
                       help="Validate mode: check for external images and fail if found (for pre-commit)")

    # Modifiers
    cli.add_argument("--dry-run", action="store_true",
                     help="Show what would be downloaded without actually downloading")
    cli.add_argument("--confirm", action="store_true",
                     help="Ask for confirmation before downloading each image")
    cli.add_argument("--ignore-external", action="store_true",
                     help="Allow external images in validate mode (warning only)")

    args = cli.parse_args()

    print("🔍 External Image Downloader")
    print("=" * 40)

    # --- Validation mode (used by pre-commit hooks); every path returns ---
    if args.validate:
        validator = ImageDownloader(args.validate)
        files_processed, external_images = validator.validate_external_images(args.ignore_external)
        found = len(external_images)

        print("\n📊 VALIDATION SUMMARY:")
        print(f" 📁 Directory: {args.validate}")
        print(f" 📄 Files processed: {files_processed}")
        print(f" 🌐 External images found: {found}")

        if not external_images:
            print(" ✅ No external images found")
            return 0

        script_name = Path(__file__).name
        if args.ignore_external:
            print(" ⚠️ Mode: WARN ONLY (external images allowed)")
            print("\n💡 To fix external images, run:")
            print(f" python3 {script_name} -d {args.validate}")
            return 0

        print(" ❌ Mode: STRICT (external images not allowed)")
        print(f"\n💡 Found {found} external images that need to be downloaded:")
        # Only the first five offenders are listed to keep the output short
        for file_path, fig_id, _url in external_images[:5]:
            print(f" 📄 {file_path}: {fig_id}")
        if found > 5:
            print(f" ... and {found - 5} more")
        print("\n💡 To fix, run:")
        print(f" python3 {script_name} -d {args.validate}")
        print("\n💡 Or to allow external images, use --ignore-external flag")
        return 1

    # --- Single-file mode ---
    if args.file:
        target = Path(args.file)
        if not target.exists():
            logger.error(f"❌ File not found: {target}")
            return 1

        # The file's parent directory serves as the base directory
        worker = ImageDownloader(target.parent)
        downloaded = worker.process_file(target, args.dry_run, args.confirm)

        print("\n📊 PROCESSING SUMMARY:")
        print(f" 📄 File: {target}")
        print(f" 📦 Images downloaded: {downloaded}")
        if args.dry_run:
            print(" 🧪 Mode: DRY RUN (no files changed)")
        elif args.confirm:
            print(" 🤔 Mode: CONFIRM (user approval required)")
        elif downloaded > 0:
            print(" ✅ File updated successfully")
        else:
            print(" No external images found in file")

    # --- Directory mode ---
    elif args.directory:
        worker = ImageDownloader(args.directory)
        files_processed, downloaded = worker.process_all_files(args.dry_run, args.confirm)

        print("\n📊 PROCESSING SUMMARY:")
        print(f" 📁 Directory: {args.directory}")
        print(f" 📄 Files processed: {files_processed}")
        print(f" 📦 Images downloaded: {downloaded}")
        if args.dry_run:
            print(" 🧪 Mode: DRY RUN (no files changed)")
        elif args.confirm:
            print(" 🤔 Mode: CONFIRM (user approval required)")
        elif downloaded > 0:
            print(" ✅ Operation completed successfully")
        else:
            print(f" No external images found in {files_processed} files")

    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is a built-in exception, whereas exit() is a helper injected by the
    # `site` module and is missing when Python runs without it (e.g. `-S`).
    raise SystemExit(main())