mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-03 08:08:51 -05:00
- Renamed vol2/advanced_intro to vol2/introduction for consistency - Updated all scripts and configs to use vol1/ instead of core/ - Updated pre-commit config to check all contents/ not just vol1/ - Updated path references in Lua filters, Python scripts, and configs
561 lines
22 KiB
Python
Executable File
561 lines
22 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
External Image Downloader for Quarto Markdown Files
|
||
|
||
This script automatically downloads external images referenced in markdown files
|
||
and organizes them locally according to the project's directory structure.
|
||
|
||
DESCRIPTION:
|
||
Processes .qmd files to find markdown images with #fig references that use
|
||
external URLs (http/https). Downloads these images and organizes them by
|
||
file type in subdirectories, then updates the markdown to reference local paths.
|
||
|
||
FEATURES:
|
||
- Smart pattern recognition for markdown figures
|
||
- Automatic file type detection and organization
|
||
- Unique filename generation to prevent conflicts
|
||
- Preserves original figure captions and IDs
|
||
- Safe dry-run mode for previewing changes
|
||
- Handles nested brackets/parentheses in captions
|
||
- Error handling and progress logging
|
||
|
||
DIRECTORY STRUCTURE:
|
||
Images are organized as: chapter/images/{file_type}/filename
|
||
- chapter/images/png/ for PNG files
|
||
- chapter/images/jpeg/ for JPEG files
|
||
- chapter/images/pdf/ for PDF files
|
||
- etc.
|
||
|
||
EXAMPLE TRANSFORMATION:
|
||
Before: ![Caption](https://example.com/image.png){#fig-id}
|
||
After: ![Caption](images/png/image.png){#fig-id}
|
||
|
||
USAGE:
|
||
# Process all files recursively from current directory
|
||
python3 download_external_images.py -d .
|
||
|
||
# Process all files in specific directory
|
||
python3 download_external_images.py -d book/contents/vol1
|
||
|
||
# Process single file
|
||
python3 download_external_images.py -f path/to/file.qmd
|
||
|
||
# Preview changes without downloading
|
||
python3 download_external_images.py -d . --dry-run
|
||
|
||
REQUIREMENTS:
|
||
- Python 3.6+
|
||
- requests library
|
||
- Internet connection for downloads
|
||
"""
|
||
|
||
import os
|
||
import re
|
||
import requests
|
||
import hashlib
|
||
from pathlib import Path
|
||
from urllib.parse import urlparse
|
||
import argparse
|
||
from typing import List, Tuple, Optional
|
||
import logging
|
||
|
||
# Set up logging - only show warnings and errors.
# NOTE: the root level is WARNING, so every logger.info()/logger.debug() call
# in this script is filtered out; only warnings, errors and the print()-based
# summaries reach the console.
logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
|
||
|
||
class ImageDownloader:
    """
    Download external images referenced by Quarto markdown files and localize them.

    The class covers the whole workflow: discovering .qmd files, extracting
    markdown images that point at http/https URLs, downloading them into
    ``<chapter>/images/<extension>/`` and rewriting the markdown so that it
    references the local copy.

    Attributes:
        base_dir (Path): Base directory containing .qmd files to process
        session (requests.Session): HTTP session with browser-like headers
    """

    def __init__(self, base_dir: str):
        """
        Initialize the ImageDownloader.

        Args:
            base_dir (str): Base directory to search for .qmd files
        """
        self.base_dir = Path(base_dir)
        self.session = requests.Session()
        # Use browser-like headers to avoid being blocked by some servers
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def find_qmd_files(self) -> List[Path]:
        """
        Find all .qmd files recursively in the base directory.

        Returns:
            List[Path]: Paths of the .qmd files found. Empty (with a logged
            warning) when the base directory does not exist.
        """
        qmd_files = []
        if not self.base_dir.exists():
            logger.warning(f"❌ Base directory does not exist: {self.base_dir}")
            return qmd_files

        # rglob also matches directories named "*.qmd"; keep regular files only
        for qmd_file in self.base_dir.rglob("*.qmd"):
            if qmd_file.is_file():
                qmd_files.append(qmd_file)
                logger.debug(f"Found .qmd file: {qmd_file}")

        print(f"📄 Found {len(qmd_files)} .qmd files")
        return qmd_files

    def extract_figure_images(self, content: str) -> List[Tuple[str, str, str, str, str]]:
        r"""
        Extract markdown images that reference external (http/https) URLs.

        Strategy: scan line by line for ``![`` and take everything up to the
        first ``}`` on the line (or the next ``![`` / end of line when no brace
        precedes it) as one image. Within that span the LAST ``](url)`` is
        treated as the image URL; earlier ones are assumed to be links inside
        the caption (e.g. citations).

        Args:
            content (str): Full text of a markdown/.qmd file

        Returns:
            List of tuples ``(full_match, caption, url, fig_id, attributes)``:
              - full_match: the exact source text of the image
              - caption: text matched by ``![...]`` up to the first ``]``
              - url: the external URL
              - fig_id: the ``fig-...`` id from the attribute block, or a
                synthetic ``fig-auto-<md5 prefix>`` when the image has none
              - attributes: the stripped contents of the ``{...}`` block
                (including the ``#fig-...`` id), or ``""`` when absent
        """
        matches = []

        for line in content.split('\n'):
            if '![' not in line:
                continue

            # Find all image patterns on this line
            idx = 0
            while idx < len(line):
                start = line.find('![', idx)
                if start == -1:
                    break

                # Find the end: either {...} or next ![
                # NOTE: the first '}' after the image start is used even when it
                # is not directly attached to the image.
                end_brace = line.find('}', start)
                next_img = line.find('![', start + 2)

                if end_brace != -1 and (next_img == -1 or end_brace < next_img):
                    end = end_brace + 1
                elif next_img != -1:
                    end = next_img
                else:
                    end = len(line)

                full_match = line[start:end]

                # Find ALL ](url) patterns in this match
                url_patterns = list(re.finditer(r'\]\(([^)]+)\)', full_match))

                if url_patterns:
                    # Take the LAST one - that's the image URL
                    url = url_patterns[-1].group(1).strip()

                    # Parse fig_id and attributes
                    fig_id = None
                    attributes = ""
                    attrs_match = re.search(r'\{([^}]+)\}', full_match)
                    if attrs_match:
                        attrs_block = attrs_match.group(1)
                        fig_match = re.search(r'#(fig-[^\s}]+)', attrs_block)
                        if fig_match:
                            fig_id = fig_match.group(1)
                        attributes = attrs_block.strip()

                    # Extract caption
                    caption_match = re.search(r'!\[([^\]]+)\]', full_match)
                    caption = caption_match.group(1) if caption_match else ""

                    # Only flag external URLs
                    if url.lower().startswith(('http://', 'https://')):
                        if not fig_id:
                            # Synthetic id so callers always have a label to report
                            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
                            fig_id = f"fig-auto-{url_hash}"

                        matches.append((full_match, caption, url, fig_id, attributes))

                # 'end' is always greater than 'start', so the scan always advances
                idx = end

        return matches

    def get_file_extension(self, url: str, response_headers: dict) -> str:
        """
        Determine the file extension from the URL path or the Content-Type header.

        Args:
            url (str): URL of the image
            response_headers (dict): HTTP response headers; the Content-Type
                key is looked up case-insensitively so plain dicts work too

        Returns:
            str: Extension without the dot. The URL suffix wins when it is one
            of the known image types; otherwise the Content-Type is mapped;
            otherwise ``'png'`` is returned and a warning is logged.
        """
        # First try to get extension from URL
        path = urlparse(url).path.lower()
        if path.endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg', '.pdf', '.webp')):
            return path.split('.')[-1]

        # Try to get from content type (header names are case-insensitive)
        content_type = ''
        for header_name, header_value in response_headers.items():
            if str(header_name).lower() == 'content-type':
                content_type = str(header_value).lower()
                break

        content_type_map = {
            'image/png': 'png',
            'image/jpeg': 'jpg',
            'image/jpg': 'jpg',
            'image/gif': 'gif',
            'image/svg+xml': 'svg',
            'application/pdf': 'pdf',
            'image/webp': 'webp'
        }

        # Substring match tolerates parameters such as "; charset=utf-8"
        for ct, ext in content_type_map.items():
            if ct in content_type:
                return ext

        # Default to png if we can't determine
        logger.warning(f"⚠️ Could not determine file type for {url}, defaulting to png")
        return 'png'

    def generate_filename(self, url: str, fig_id: str, extension: str) -> str:
        """
        Generate a local filename based on the original URL filename.

        The stem is taken from the last URL path component with its extension
        and any trailing ``_<8+ alphanumerics>`` suffix removed. When fewer
        than three characters remain, the figure id (without its ``fig-`` /
        ``auto-`` prefixes) is used instead, and a hash of the URL is the last
        resort. No check is made against files that already exist.

        Args:
            url (str): Original URL of the image
            fig_id (str): Figure identifier from markdown (e.g., 'fig-example')
            extension (str): File extension (e.g., 'png', 'jpg')

        Returns:
            str: Generated filename (e.g., 'oranges-frogs.png')

        Example:
            URL: https://example.com/path/oranges-frogs_nHEaTqne53.png?params
            Result: oranges-frogs.png
        """
        # Get the filename from the URL path (query string is not part of it)
        original_filename = os.path.basename(urlparse(url).path)

        # Remove any existing extension
        base_name = original_filename.rsplit('.', 1)[0] if '.' in original_filename else original_filename

        # Remove trailing underscore + hash patterns (common in CDN URLs)
        # e.g., "oranges-frogs_nHEaTqne53" -> "oranges-frogs"
        # NOTE: this also strips ordinary words of 8+ characters after an
        # underscore (e.g. "deep_learning" -> "deep").
        base_name = re.sub(r'_[a-zA-Z0-9]{8,}$', '', base_name)

        # If base_name is empty or too short, fall back to fig_id
        if not base_name or len(base_name) < 3:
            base_name = fig_id.replace('fig-', '').replace('auto-', '')

        # Sanitize the filename - keep only alphanumeric, hyphens, and underscores
        base_name = re.sub(r'[^a-zA-Z0-9_-]', '-', base_name)
        # Remove multiple consecutive hyphens
        base_name = re.sub(r'-+', '-', base_name)
        # Remove leading/trailing hyphens
        base_name = base_name.strip('-')

        # Last resort: sanitizing can leave nothing (e.g. "~~~"); use URL hash
        # so the result is never just ".<extension>"
        if not base_name:
            base_name = hashlib.md5(url.encode()).hexdigest()[:12]

        return f"{base_name}.{extension}"

    @staticmethod
    def _localize_reference(full_match: str, local_path: str) -> str:
        """Return full_match with its image URL (the last ``](...)``) replaced by local_path."""
        url_spans = list(re.finditer(r'\]\(([^)]+)\)', full_match))
        if not url_spans:
            return full_match
        image_url = url_spans[-1]
        # Only the URL changes, so caption and attribute block stay untouched
        return full_match[:image_url.start(1)] + local_path + full_match[image_url.end(1):]

    def download_image(self, url: str, output_path: Path) -> bool:
        """
        Download an image from url to output_path.

        The body is streamed into a sibling ``.part`` file that is renamed onto
        output_path only after the download completes, so a failed transfer
        never leaves a truncated file at output_path.

        Args:
            url (str): URL to fetch
            output_path (Path): Destination file; parent directories are created

        Returns:
            bool: True on success, False on any error (the error is logged)
        """
        partial_path = output_path.with_name(output_path.name + ".part")
        try:
            logger.info(f"📦 Downloading {url}")
            # The context manager releases the streamed connection back to the pool
            with self.session.get(url, stream=True, timeout=30) as response:
                response.raise_for_status()

                # Create directory if it doesn't exist
                output_path.parent.mkdir(parents=True, exist_ok=True)

                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            os.replace(partial_path, output_path)
            logger.info(f"✅ Successfully downloaded to {output_path}")
            return True

        except Exception as e:
            logger.error(f"❌ Failed to download {url}: {e}")
            # Best-effort cleanup of the incomplete download
            try:
                partial_path.unlink()
            except OSError:
                pass
            return False

    def process_file(self, qmd_file: Path, dry_run: bool = False, confirm: bool = False) -> int:
        """
        Process a single .qmd file and download its external images.

        For every external image the target is
        ``<qmd dir>/images/<extension>/<filename>``. When that file already
        exists, or the download succeeds, the image URL in the markdown is
        replaced by the relative local path; caption and attributes are left
        exactly as written. The file is rewritten only when something changed.

        Args:
            qmd_file (Path): File to process
            dry_run (bool): Only report what would be downloaded
            confirm (bool): Ask on stdin before each download

        Returns:
            int: Number of images downloaded (0 when the file cannot be read)
        """
        logger.info(f"📄 Processing {qmd_file}")

        try:
            with open(qmd_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            logger.error(f"❌ Failed to read {qmd_file}: {e}")
            return 0

        figure_images = self.extract_figure_images(content)

        if not figure_images:
            logger.info(f"📄 No external figure images found in {qmd_file}")
            return 0

        logger.info(f"🔍 Found {len(figure_images)} external figure images")

        downloaded_count = 0
        updated_count = 0
        new_content = content

        for full_match, _caption, url, fig_id, _attributes in figure_images:
            logger.info(f"🔍 Processing {fig_id}: {url}")

            if dry_run:
                # Printed rather than logged: INFO records are filtered out by
                # the module's WARNING log level, which would hide the preview
                print(f"🧪 [DRY RUN] Would download {url} for {fig_id}")
                continue

            # Confirmation mode
            if confirm:
                print(f"\n🤔 Download external image?")
                print(f" 📄 File: {qmd_file}")
                print(f" 🔍 Figure: {fig_id}")
                print(f" 🌐 URL: {url}")
                answer = input(" Download? [y/N]: ").lower().strip()
                if answer not in ['y', 'yes']:
                    logger.info(f"⏭️ Skipped {fig_id}")
                    continue

            # Get the content type from a HEAD request. Redirects are followed
            # so the headers describe the image rather than the redirect.
            try:
                head_response = self.session.head(url, timeout=10, allow_redirects=True)
                headers = head_response.headers
            except Exception as e:
                # Best effort: the URL suffix alone may still identify the type
                logger.warning(f"⚠️ HEAD request failed for {url}: {e}")
                headers = {}
            extension = self.get_file_extension(url, headers)

            # Images live next to the .qmd file, grouped by extension
            images_dir = qmd_file.parent / "images" / extension

            # Generate filename
            filename = self.generate_filename(url, fig_id, extension)
            output_path = images_dir / filename

            # Replacement markdown: same image, URL swapped for the local path
            local_path = f"images/{extension}/{filename}"
            replacement = self._localize_reference(full_match, local_path)

            # Check if file already exists
            # NOTE: an existing file with the same generated name is reused
            # without checking that it came from this URL.
            if output_path.exists():
                logger.info(f"📁 File already exists: {output_path}")
                # Update the markdown anyway in case the reference is wrong
                new_content = new_content.replace(full_match, replacement)
                updated_count += 1
                continue

            # Download the image
            if self.download_image(url, output_path):
                # Update the markdown content to use local path
                new_content = new_content.replace(full_match, replacement)
                downloaded_count += 1
                updated_count += 1
            else:
                logger.warning(f"⚠️ Skipping update for failed download: {fig_id}")

        # Write updated content back to file if we made changes
        if new_content != content:
            try:
                with open(qmd_file, 'w', encoding='utf-8') as f:
                    f.write(new_content)
                logger.info(f"✅ Updated {qmd_file} with {updated_count} local image references ({downloaded_count} downloaded)")
            except Exception as e:
                logger.error(f"❌ Failed to write updated content to {qmd_file}: {e}")

        return downloaded_count

    def process_all_files(self, dry_run: bool = False, confirm: bool = False) -> Tuple[int, int]:
        """
        Process all .qmd files in the base directory.

        A failure in one file is logged and does not stop the remaining files.

        Args:
            dry_run (bool): Passed through to process_file
            confirm (bool): Passed through to process_file

        Returns:
            Tuple of (files_processed, images_downloaded)
        """
        qmd_files = self.find_qmd_files()
        print(f"🔍 Found {len(qmd_files)} .qmd files to process")

        total_downloaded = 0
        files_processed = 0

        for qmd_file in qmd_files:
            try:
                downloaded = self.process_file(qmd_file, dry_run, confirm)
                total_downloaded += downloaded
                files_processed += 1
            except Exception as e:
                logger.error(f"❌ Error processing {qmd_file}: {e}")

        return files_processed, total_downloaded

    def validate_external_images(self, ignore_external: bool = False) -> Tuple[int, List[Tuple[Path, str, str]]]:
        """
        Validate mode: Find external images without downloading.

        Args:
            ignore_external (bool): If True, external images are logged as
                warnings; otherwise they are logged as errors

        Returns:
            Tuple of (total_files_processed, list_of_external_images)
            Each external image is (file_path, fig_id, url)
        """
        qmd_files = self.find_qmd_files()
        print(f"🔍 Validating {len(qmd_files)} .qmd files for external images")

        all_external_images = []

        for qmd_file in qmd_files:
            try:
                with open(qmd_file, 'r', encoding='utf-8') as f:
                    content = f.read()

                figure_images = self.extract_figure_images(content)

                for _full_match, _caption, url, fig_id, _attributes in figure_images:
                    all_external_images.append((qmd_file, fig_id, url))

                    if ignore_external:
                        logger.warning(f"⚠️ External image in {qmd_file}: {fig_id} → {url}")
                    else:
                        logger.error(f"❌ External image found in {qmd_file}: {fig_id} → {url}")

            except Exception as e:
                logger.error(f"❌ Error reading {qmd_file}: {e}")

        return len(qmd_files), all_external_images
|
||
|
||
def main():
    """
    Command-line entry point.

    Exactly one mode is required: -f/--file, -d/--directory or --validate.

    Returns:
        int: Exit status. 1 when strict validation finds external images or
        when the file given with -f does not exist; 0 otherwise.
    """
    cli = argparse.ArgumentParser(
        description="Download external images from Quarto markdown files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s -d . # Process all files recursively from current directory
%(prog)s -d book/contents/vol1 # Process all files in specific directory
%(prog)s -f chapter/file.qmd # Process single file
%(prog)s -d . --dry-run # Preview what would be downloaded
%(prog)s -d . --confirm # Ask for confirmation before each download
%(prog)s --validate book/contents/vol1 # Validate mode: fail if external images found (pre-commit)
%(prog)s --validate . --ignore-external # Validate mode: warn only (allow external images)
""")

    # Mode selection (mutually exclusive)
    modes = cli.add_mutually_exclusive_group(required=True)
    modes.add_argument(
        "-f", "--file", type=str, metavar="FILE",
        help="Process only a specific file",
    )
    modes.add_argument(
        "-d", "--directory", type=str, metavar="DIR",
        help="Process all .qmd files recursively in specified directory",
    )
    modes.add_argument(
        "--validate", type=str, metavar="DIR",
        help="Validate mode: check for external images and fail if found (for pre-commit)",
    )

    # Options
    cli.add_argument(
        "--dry-run", action="store_true",
        help="Show what would be downloaded without actually downloading",
    )
    cli.add_argument(
        "--confirm", action="store_true",
        help="Ask for confirmation before downloading each image",
    )
    cli.add_argument(
        "--ignore-external", action="store_true",
        help="Allow external images in validate mode (warning only)",
    )

    opts = cli.parse_args()

    print("🔍 External Image Downloader")
    print("=" * 40)

    # --- Validation mode (for pre-commit hooks) ---
    if opts.validate:
        checker = ImageDownloader(opts.validate)
        file_count, offenders = checker.validate_external_images(opts.ignore_external)
        offender_count = len(offenders)

        print("\n📊 VALIDATION SUMMARY:")
        print(f" 📁 Directory: {opts.validate}")
        print(f" 📄 Files processed: {file_count}")
        print(f" 🌐 External images found: {offender_count}")

        if not offenders:
            print(" ✅ No external images found")
            return 0

        script_name = Path(__file__).name
        if opts.ignore_external:
            print(" ⚠️ Mode: WARN ONLY (external images allowed)")
            print("\n💡 To fix external images, run:")
            print(f" python3 {script_name} -d {opts.validate}")
            return 0

        print(" ❌ Mode: STRICT (external images not allowed)")
        print(f"\n💡 Found {offender_count} external images that need to be downloaded:")
        # Only the first five offenders are listed; the rest are summarized
        for offending_file, offending_id, _url in offenders[:5]:
            print(f" 📄 {offending_file}: {offending_id}")
        if offender_count > 5:
            print(f" ... and {offender_count - 5} more")
        print("\n💡 To fix, run:")
        print(f" python3 {script_name} -d {opts.validate}")
        print("\n💡 Or to allow external images, use --ignore-external flag")
        return 1

    # --- Single-file mode ---
    if opts.file:
        target = Path(opts.file)
        if not target.exists():
            logger.error(f"❌ File not found: {target}")
            return 1

        # Use the file's parent directory for organizing images
        worker = ImageDownloader(target.parent)
        fetched = worker.process_file(target, opts.dry_run, opts.confirm)

        print("\n📊 PROCESSING SUMMARY:")
        print(f" 📄 File: {target}")
        print(f" 📦 Images downloaded: {fetched}")
        if opts.dry_run:
            print(" 🧪 Mode: DRY RUN (no files changed)")
        elif opts.confirm:
            print(" 🤔 Mode: CONFIRM (user approval required)")
        elif fetched > 0:
            print(" ✅ File updated successfully")
        else:
            print(" ℹ️ No external images found in file")
        return 0

    # --- Directory mode ---
    if opts.directory:
        worker = ImageDownloader(opts.directory)
        file_count, fetched = worker.process_all_files(opts.dry_run, opts.confirm)

        print("\n📊 PROCESSING SUMMARY:")
        print(f" 📁 Directory: {opts.directory}")
        print(f" 📄 Files processed: {file_count}")
        print(f" 📦 Images downloaded: {fetched}")
        if opts.dry_run:
            print(" 🧪 Mode: DRY RUN (no files changed)")
        elif opts.confirm:
            print(" 🤔 Mode: CONFIRM (user approval required)")
        elif fetched > 0:
            print(" ✅ Operation completed successfully")
        else:
            print(f" ℹ️ No external images found in {file_count} files")

    return 0
|
||
|
||
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit is
    # raised directly because the exit() helper is injected by the `site`
    # module and is not available when Python runs with -S or in frozen builds.
    raise SystemExit(main())
|