refactor: move publish scripts to quarto/publish/ for better architecture

Move compression scripts from tools/scripts/publish/ to quarto/publish/:
- compress_epub.py → quarto/publish/compress_epub.py
- compress_pdf.py → quarto/publish/compress_pdf.py

Rationale:
- Publishing scripts are part of Quarto workflow, not general tooling
- quarto/ directory is mounted in containers, tools/ is not
- Cleaner separation: tools/ = environment, quarto/publish/ = content packaging
- Fixes container path issues where tools/scripts/publish/ was not available

Updated all workflow references to use new paths.
This should resolve the 'No such file or directory' errors in containers.
This commit is contained in:
Vijay Janapa Reddi
2025-08-21 21:25:09 -04:00
parent 2911369115
commit 1a6dfdb528
4 changed files with 833 additions and 12 deletions

View File

@@ -626,7 +626,7 @@ jobs:
run: |
if [ -f "Machine-Learning-Systems.pdf" ]; then
echo "📉 Compressing PDF with professional compression tool..."
python3 ${{ github.workspace }}/tools/scripts/publish/compress_pdf.py \
python3 ${{ github.workspace }}/quarto/publish/compress_pdf.py \
--input "Machine-Learning-Systems.pdf" \
--output "compressed.pdf" \
--quality minimal \
@@ -657,7 +657,7 @@ jobs:
Write-Output "📉 Compressing PDF with professional compression tool..."
python ${{ github.workspace }}/tools/scripts/publish/compress_pdf.py --input $input --output $output --quality minimal --verbose
python ${{ github.workspace }}/quarto/publish/compress_pdf.py --input $input --output $output --quality minimal --verbose
if (Test-Path $output) {
Write-Output "✅ PDF compression completed"
@@ -674,9 +674,9 @@ jobs:
echo "📚 Compressing EPUB with optimized compression tool..."
echo "🔍 DEBUG: GITHUB_WORKSPACE=${{ github.workspace }}"
echo "🔍 DEBUG: PWD=$(pwd)"
echo "🔍 DEBUG: Script path: ${{ github.workspace }}/tools/scripts/publish/compress_epub.py"
ls -la "${{ github.workspace }}/tools/scripts/publish/" || echo "❌ Directory not found"
python3 ${{ github.workspace }}/tools/scripts/publish/compress_epub.py \
echo "🔍 DEBUG: Script path: ${{ github.workspace }}/quarto/publish/compress_epub.py"
ls -la "${{ github.workspace }}/quarto/publish/" || echo "❌ Directory not found"
python3 ${{ github.workspace }}/quarto/publish/compress_epub.py \
--input "Machine-Learning-Systems.epub" \
--output "compressed.epub" \
--verbose
@@ -706,7 +706,7 @@ jobs:
Write-Output "📚 Compressing EPUB with optimized compression tool..."
python ${{ github.workspace }}/tools/scripts/publish/compress_epub.py --input $input --output $output --verbose
python ${{ github.workspace }}/quarto/publish/compress_epub.py --input $input --output $output --verbose
if (Test-Path $output) {
Write-Output "✅ EPUB compression completed (using optimized defaults: quality=50, max-size=1000px)"

View File

@@ -255,7 +255,7 @@ jobs:
run: |
if [ -f "Machine-Learning-Systems.pdf" ]; then
echo "📉 Compressing PDF with professional compression tool..."
python3 ${{ github.workspace }}/tools/scripts/publish/compress_pdf.py \
python3 ${{ github.workspace }}/quarto/publish/compress_pdf.py \
--input "Machine-Learning-Systems.pdf" \
--output "compressed.pdf" \
--quality minimal \
@@ -274,7 +274,7 @@ jobs:
docker run --rm -v "$($PWD.Path):C:\workspace" -w "C:\workspace\quarto\${{ matrix.output_dir }}" ${{ env.CONTAINER_IMAGE }} powershell -Command "
if (Test-Path 'Machine-Learning-Systems.pdf') {
Write-Host '📉 Compressing PDF with professional compression tool...'
python C:\workspace\tools\scripts\publish\compress_pdf.py --input 'Machine-Learning-Systems.pdf' --output 'compressed.pdf' --quality minimal --verbose
python C:\workspace\quarto\publish\compress_pdf.py --input 'Machine-Learning-Systems.pdf' --output 'compressed.pdf' --quality minimal --verbose
if (Test-Path 'compressed.pdf') {
Move-Item -Force 'compressed.pdf' 'Machine-Learning-Systems.pdf'
Write-Host '✅ PDF compression completed'
@@ -293,9 +293,9 @@ jobs:
echo "📚 Compressing EPUB with optimized compression tool..."
echo "🔍 DEBUG: GITHUB_WORKSPACE=${{ github.workspace }}"
echo "🔍 DEBUG: PWD=$(pwd)"
echo "🔍 DEBUG: Script path: ${{ github.workspace }}/tools/scripts/publish/compress_epub.py"
ls -la "${{ github.workspace }}/tools/scripts/publish/" || echo "❌ Directory not found"
python3 ${{ github.workspace }}/tools/scripts/publish/compress_epub.py \
echo "🔍 DEBUG: Script path: ${{ github.workspace }}/quarto/publish/compress_epub.py"
ls -la "${{ github.workspace }}/quarto/publish/" || echo "❌ Directory not found"
python3 ${{ github.workspace }}/quarto/publish/compress_epub.py \
--input "Machine-Learning-Systems.epub" \
--output "compressed.epub" \
--verbose
@@ -313,7 +313,7 @@ jobs:
docker run --rm -v "$($PWD.Path):C:\workspace" -w "C:\workspace\quarto\${{ matrix.output_dir }}" ${{ env.CONTAINER_IMAGE }} powershell -Command "
if (Test-Path 'Machine-Learning-Systems.epub') {
Write-Host '📚 Compressing EPUB with optimized compression tool...'
python C:\workspace\tools\scripts\publish\compress_epub.py --input 'Machine-Learning-Systems.epub' --output 'compressed.epub' --verbose
python C:\workspace\quarto\publish\compress_epub.py --input 'Machine-Learning-Systems.epub' --output 'compressed.epub' --verbose
if (Test-Path 'compressed.epub') {
Move-Item -Force 'compressed.epub' 'Machine-Learning-Systems.epub'
Write-Host '✅ EPUB compression completed (using optimized defaults: quality=50, max-size=1000px)'

435
quarto/publish/compress_epub.py Executable file
View File

@@ -0,0 +1,435 @@
#!/usr/bin/env python3
"""
EPUB Compression Tool for MLSysBook
This tool compresses EPUB files by optimizing embedded images while maintaining
EPUB format compliance. It extracts the EPUB, compresses images, and repacks
the archive following EPUB specifications.
Usage:
python compress_epub.py --input input.epub --output output.epub [options]
Author: MLSysBook Team
License: MIT
"""
import argparse
import logging
import os
import shutil
import sys
import tempfile
import zipfile
from pathlib import Path
from typing import Optional, Tuple
try:
from PIL import Image
except ImportError:
print("❌ Error: Pillow library is required. Install with: pip install Pillow")
sys.exit(1)
class EPUBCompressor:
    """
    Compress EPUB files by optimizing their embedded images.

    The compressor unpacks the EPUB (a ZIP archive) into a temporary
    directory, re-encodes every supported image in place (optionally
    downscaling it), and repacks the archive with the ``mimetype`` entry
    first and uncompressed, as the EPUB container format requires.
    """

    SUPPORTED_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff')

    def __init__(self, quality: int = 50, max_size: int = 1000, verbose: bool = False):
        """
        Initialize the EPUB compressor.

        Args:
            quality: JPEG compression quality (1-100, higher = better quality)
            max_size: Maximum dimension for image resizing (pixels)
            verbose: Enable verbose logging output
        """
        self.quality = quality
        self.max_size = max_size
        self.verbose = verbose
        self._setup_logging()

    def _setup_logging(self) -> None:
        """Configure logging based on verbosity level."""
        level = logging.DEBUG if self.verbose else logging.INFO
        logging.basicConfig(
            level=level,
            format='%(levelname)s: %(message)s',
            handlers=[logging.StreamHandler()]
        )
        self.logger = logging.getLogger(__name__)

    def _validate_inputs(self, input_path: Path, output_path: Path) -> None:
        """
        Validate input parameters and file paths.

        Creates the output directory as a side effect once validation passes.

        Args:
            input_path: Path to input EPUB file
            output_path: Path for output EPUB file

        Raises:
            FileNotFoundError: If input file doesn't exist
            ValueError: If parameters are invalid
        """
        if not input_path.exists():
            raise FileNotFoundError(f"Input EPUB file not found: {input_path}")
        if input_path.suffix.lower() != '.epub':
            raise ValueError(f"Input file must be an EPUB: {input_path}")
        if not 1 <= self.quality <= 100:
            raise ValueError(f"Quality must be between 1-100, got: {self.quality}")
        if not 100 <= self.max_size <= 5000:
            raise ValueError(f"Max size must be between 100-5000 pixels, got: {self.max_size}")

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        self.logger.info(f"📖 Input EPUB: {input_path}")
        self.logger.info(f"📦 Output EPUB: {output_path}")
        self.logger.info(f"🎨 Image quality: {self.quality}%")
        self.logger.info(f"📏 Max image size: {self.max_size}px")

    def _encode_image(self, img, img_format: Optional[str]) -> bytes:
        """
        Re-encode an already opened (and possibly resized) image in memory.

        Args:
            img: Open Pillow image to encode
            img_format: Format Pillow detected when the file was opened

        Returns:
            The encoded image bytes
        """
        import io  # function-scope import: the module header does not import io

        buffer = io.BytesIO()
        if img_format in ('JPEG', 'JPG'):
            # JPEG has no alpha channel: flatten transparency onto white.
            if img.mode in ('RGBA', 'LA'):
                flattened = Image.new('RGB', img.size, (255, 255, 255))
                flattened.paste(img, mask=img.split()[-1])
                img = flattened
            img.save(buffer, 'JPEG', quality=self.quality, optimize=True)
        elif img_format == 'PNG':
            # Try an adaptive-palette conversion first for maximum compression.
            try:
                img.convert('P', palette=Image.ADAPTIVE).save(buffer, 'PNG', optimize=True)
            except Exception:
                # Palette conversion failed: start over with a plain optimized PNG.
                buffer = io.BytesIO()
                img.save(buffer, 'PNG', optimize=True)
        else:
            # NOTE(review): other formats (GIF/BMP/TIFF) are re-encoded as
            # PNG/JPEG but keep their original file name, so the content no
            # longer matches the extension - confirm readers tolerate this.
            if img.mode in ('RGBA', 'LA'):
                img.save(buffer, 'PNG', optimize=True)
            else:
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                img.save(buffer, 'JPEG', quality=self.quality, optimize=True)
        return buffer.getvalue()

    def _compress_image(self, image_path: Path) -> Tuple[bool, Optional[str]]:
        """
        Compress a single image file in place.

        The image is encoded into memory first and only written back when the
        result is smaller than the file on disk, so a failing encoder can
        never leave a truncated image behind and the file never grows.

        Args:
            image_path: Path to the image file to compress

        Returns:
            Tuple of (success: bool, error_message: Optional[str])
        """
        try:
            original_size = image_path.stat().st_size
            with Image.open(image_path) as img:
                img_format = img.format
                original_dimensions = img.size

                # Resize if image is too large
                if max(img.size) > self.max_size:
                    try:
                        # Pillow >= 9.1 exposes the filters on Image.Resampling
                        resample = Image.Resampling.LANCZOS
                    except AttributeError:
                        # Older Pillow only has the module-level constant
                        resample = Image.LANCZOS
                    img.thumbnail((self.max_size, self.max_size), resample)
                    self.logger.debug(f" 📏 Resized {original_dimensions} → {img.size}")

                encoded = self._encode_image(img, img_format)

            # Write only after the source handle is closed, and only on a gain.
            if len(encoded) < original_size:
                image_path.write_bytes(encoded)
                new_size = len(encoded)
            else:
                new_size = original_size
                self.logger.debug(" ↩️ Re-encoded image is not smaller, kept original")

            compression_ratio = (1 - new_size / original_size) * 100 if original_size > 0 else 0
            self.logger.debug(f" 💾 {original_size:,} → {new_size:,} bytes ({compression_ratio:.1f}% reduction)")
            return True, None
        except Exception as e:
            error_msg = f"Failed to compress {image_path.name}: {str(e)}"
            self.logger.warning(f" ⚠️ {error_msg}")
            return False, error_msg

    def _extract_epub(self, epub_path: Path, extract_dir: Path) -> None:
        """
        Extract EPUB contents to temporary directory.

        Args:
            epub_path: Path to EPUB file
            extract_dir: Directory to extract contents to

        Raises:
            ValueError: If the file is not a valid ZIP archive
            RuntimeError: If extraction fails for any other reason
        """
        self.logger.info("📂 Extracting EPUB contents...")
        try:
            with zipfile.ZipFile(epub_path, 'r') as zip_file:
                zip_file.extractall(extract_dir)
            self.logger.debug(f" ✅ Extracted to: {extract_dir}")
        except zipfile.BadZipFile as e:
            raise ValueError(f"Invalid EPUB file (not a valid ZIP): {epub_path}") from e
        except Exception as e:
            raise RuntimeError(f"Failed to extract EPUB: {str(e)}") from e

    def _compress_images_in_directory(self, directory: Path) -> Tuple[int, int]:
        """
        Find and compress all images in the extracted EPUB directory.

        Files are matched on their lower-cased suffix in a single directory
        walk, so every image is visited exactly once regardless of extension
        case or filesystem case sensitivity.

        Args:
            directory: Root directory to search for images

        Returns:
            Tuple of (total_images: int, compressed_images: int)
        """
        self.logger.info("🎨 Compressing images...")
        image_files = sorted(
            candidate
            for candidate in directory.rglob('*')
            if candidate.is_file()
            and candidate.suffix.lower() in self.SUPPORTED_IMAGE_EXTENSIONS
        )
        total_images = len(image_files)
        compressed_images = 0

        if total_images == 0:
            self.logger.info(" No images found to compress")
            return 0, 0

        self.logger.info(f" 📊 Found {total_images} images to process")
        for i, image_path in enumerate(image_files, 1):
            self.logger.debug(f" 🖼️ [{i}/{total_images}] {image_path.name}")
            success, _ = self._compress_image(image_path)
            if success:
                compressed_images += 1

        self.logger.info(f" ✅ Successfully compressed {compressed_images}/{total_images} images")
        return total_images, compressed_images

    def _repack_epub(self, source_dir: Path, output_path: Path) -> None:
        """
        Repack the directory contents into a new EPUB file.

        Args:
            source_dir: Directory containing extracted and processed EPUB contents
            output_path: Path for the output EPUB file

        Raises:
            RuntimeError: If the archive cannot be written
        """
        self.logger.info("📦 Repacking EPUB...")
        try:
            with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
                # First, add mimetype uncompressed (EPUB specification requirement)
                mimetype_path = source_dir / 'mimetype'
                if mimetype_path.exists():
                    zip_file.write(mimetype_path, 'mimetype', compress_type=zipfile.ZIP_STORED)
                    self.logger.debug(" 📄 Added mimetype (uncompressed)")
                else:
                    self.logger.warning(" ⚠️ No mimetype file found; output may not be a valid EPUB")

                # Add all other files with compression. Only the root-level
                # mimetype is skipped; same-named files deeper down are kept.
                files_added = 0
                for file_path in sorted(source_dir.rglob('*')):
                    if file_path.is_file() and file_path != mimetype_path:
                        arcname = file_path.relative_to(source_dir).as_posix()
                        zip_file.write(file_path, arcname)
                        files_added += 1
                self.logger.debug(f" ✅ Added {files_added} files to EPUB")
        except Exception as e:
            raise RuntimeError(f"Failed to repack EPUB: {str(e)}") from e

    def compress(self, input_path: Path, output_path: Path) -> dict:
        """
        Compress an EPUB file by optimizing embedded images.

        Args:
            input_path: Path to input EPUB file
            output_path: Path for compressed output EPUB file

        Returns:
            Dictionary with compression statistics

        Raises:
            FileNotFoundError: If input file doesn't exist
            ValueError: If input parameters are invalid
            RuntimeError: If compression process fails
        """
        # Validate inputs
        self._validate_inputs(input_path, output_path)

        # Get original file size
        original_size = input_path.stat().st_size
        self.logger.info(f"📊 Original EPUB size: {original_size:,} bytes ({original_size/1024/1024:.1f} MB)")

        # Create temporary directory for processing
        with tempfile.TemporaryDirectory(prefix='epub_compress_') as temp_dir:
            temp_path = Path(temp_dir)
            try:
                self._extract_epub(input_path, temp_path)
                total_images, compressed_images = self._compress_images_in_directory(temp_path)
                self._repack_epub(temp_path, output_path)
            except Exception:
                # Clean up output file if it was partially created
                if output_path.exists():
                    output_path.unlink()
                raise

        # Calculate final statistics
        final_size = output_path.stat().st_size
        compression_ratio = (1 - final_size / original_size) * 100 if original_size > 0 else 0
        stats = {
            'original_size': original_size,
            'final_size': final_size,
            'compression_ratio': compression_ratio,
            'size_saved': original_size - final_size,
            'total_images': total_images,
            'compressed_images': compressed_images
        }

        self.logger.info("✅ Compression complete!")
        self.logger.info(f"📊 Final size: {final_size:,} bytes ({final_size/1024/1024:.1f} MB)")
        self.logger.info(f"💾 Size reduction: {compression_ratio:.1f}% ({stats['size_saved']:,} bytes saved)")
        return stats
def create_argument_parser() -> argparse.ArgumentParser:
    """Build the command-line interface for the EPUB compression tool."""
    # Assembled line by line; the leading and trailing empty entries keep the
    # newline that opens and closes the help epilog.
    usage_notes = "\n".join([
        "",
        "Examples:",
        "%(prog)s --input input.epub --output output.epub",
        "%(prog)s -i input.epub -o output.epub",
        "%(prog)s -i input.epub -o output.epub --quality 60 --max-size 1200",
        "%(prog)s -i input.epub -o output.epub --verbose",
        "%(prog)s -i input.epub -o output.epub -q 40 -s 800 -v",
        "Quality Guidelines:",
        "90-100: Highest quality, larger files",
        "50-89: Good quality, balanced size (recommended)",
        "35-49: Acceptable quality, smaller files",
        "1-34: Lower quality, smallest files",
        "Max Size Guidelines:",
        "1000px: Default, optimized balance of quality and size",
        "1200px: Higher quality for detailed images",
        "800px: Compact, suitable for basic readers",
        "600px: Maximum compression for size-critical applications",
        "",
    ])

    cli = argparse.ArgumentParser(
        description="Compress EPUB files by optimizing embedded images while maintaining format compliance.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_notes,
    )

    # Required input/output paths share everything but their help text.
    path_options = (
        ('--input', '-i', 'Path to the input EPUB file to compress'),
        ('--output', '-o', 'Path for the compressed output EPUB file'),
    )
    for long_flag, short_flag, description in path_options:
        cli.add_argument(
            long_flag, short_flag,
            type=Path,
            required=True,
            metavar='EPUB_FILE',
            help=description,
        )

    # Integer tuning knobs: (long flag, short flag, default, metavar, help).
    numeric_options = (
        ('--quality', '-q', 50, 'N', 'JPEG compression quality (1-100, default: 50)'),
        ('--max-size', '-s', 1000, 'PIXELS', 'Maximum image dimension in pixels (default: 1000)'),
    )
    for long_flag, short_flag, fallback, placeholder, description in numeric_options:
        cli.add_argument(
            long_flag, short_flag,
            type=int,
            default=fallback,
            metavar=placeholder,
            help=description,
        )

    cli.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose output with detailed progress information',
    )
    cli.add_argument('--version', action='version', version='%(prog)s 1.0.0')
    return cli
def main(argv: Optional[list] = None) -> int:
    """
    Main entry point for the EPUB compression tool.

    Args:
        argv: Command-line arguments to parse. Defaults to ``None``, in which
            case argparse reads ``sys.argv[1:]``; passing a list lets the
            tool be driven programmatically.

    Returns:
        Exit code (0 for success, 1 for error)
    """
    parser = create_argument_parser()
    args = parser.parse_args(argv)
    try:
        # Create compressor instance
        compressor = EPUBCompressor(
            quality=args.quality,
            max_size=args.max_size,
            verbose=args.verbose
        )

        # Perform compression
        stats = compressor.compress(args.input, args.output)

        # Success message
        print("\n🎉 EPUB compression successful!")
        print(f"📁 Output: {args.output}")
        print(f"💾 Size reduction: {stats['compression_ratio']:.1f}%")
        print(f"🖼️ Images processed: {stats['compressed_images']}/{stats['total_images']}")
        return 0
    except KeyboardInterrupt:
        print("\n❌ Operation cancelled by user")
        return 1
    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        print(f"\n❌ Error: {str(e)}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 1


if __name__ == '__main__':
    sys.exit(main())

386
quarto/publish/compress_pdf.py Executable file
View File

@@ -0,0 +1,386 @@
#!/usr/bin/env python3
"""
PDF Compression Tool for MLSysBook
This tool compresses PDF files using Ghostscript with optimized settings for
academic textbooks. It reduces file size while maintaining readability and
print quality suitable for educational content.
Usage:
python compress_pdf.py --input input.pdf --output output.pdf [options]
Author: MLSysBook Team
License: MIT
"""
import argparse
import logging
import os
import platform
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Optional
class PDFCompressor:
    """
    Compress PDF files by re-distilling them with Ghostscript.

    The compressor runs Ghostscript's ``pdfwrite`` device with one of the
    ``-dPDFSETTINGS`` presets below. The ``minimal`` preset issues only the
    basic switches; every other preset adds a few extra options (see
    ``_build_ghostscript_command``).
    """

    # Ghostscript quality presets
    QUALITY_PRESETS = {
        'screen': '/screen',      # Lowest quality, smallest files (72 dpi)
        'ebook': '/ebook',        # Good for e-readers (150 dpi) - DEFAULT
        'printer': '/printer',    # Good for printing (300 dpi)
        'prepress': '/prepress',  # Highest quality (300+ dpi)
        'default': '/default',    # Ghostscript default settings
        'minimal': '/ebook'       # Minimal mode - matches original workflow exactly
    }

    # PDF versions accepted for -dCompatibilityLevel
    SUPPORTED_COMPATIBILITY = ('1.3', '1.4', '1.5', '1.6', '1.7')

    def __init__(self, quality: str = 'ebook', compatibility: str = '1.4', verbose: bool = False):
        """
        Initialize the PDF compressor.

        Args:
            quality: Compression quality preset (screen, ebook, printer,
                prepress, default, minimal)
            compatibility: PDF compatibility level (1.3, 1.4, 1.5, 1.6, 1.7)
            verbose: Enable verbose logging output

        Raises:
            RuntimeError: If no Ghostscript executable can be found
        """
        self.quality = quality
        self.compatibility = compatibility
        self.verbose = verbose
        self._setup_logging()
        self._validate_dependencies()

    def _setup_logging(self) -> None:
        """Configure logging based on verbosity level."""
        level = logging.DEBUG if self.verbose else logging.INFO
        logging.basicConfig(
            level=level,
            format='%(levelname)s: %(message)s',
            handlers=[logging.StreamHandler()]
        )
        self.logger = logging.getLogger(__name__)

    def _validate_dependencies(self) -> None:
        """
        Check if Ghostscript is available and determine the correct executable.

        Raises:
            RuntimeError: If none of the candidate executables can be run
        """
        if platform.system() == 'Windows':
            # Windows installs ship a console binary named after the
            # architecture; fall back to a plain gs on PATH.
            gs_candidates = ['gswin64c', 'gswin32c', 'gs']
        else:
            # On Linux/macOS, use gs
            gs_candidates = ['gs']

        self.gs_executable = None
        for gs_cmd in gs_candidates:
            try:
                result = subprocess.run([gs_cmd, '--version'],
                                        capture_output=True, text=True, check=True)
            except (subprocess.CalledProcessError, OSError):
                # Missing, not executable, or broken: try the next candidate.
                continue
            gs_version = result.stdout.strip()
            self.gs_executable = gs_cmd
            self.logger.debug(f"Found Ghostscript executable: {gs_cmd}")
            self.logger.debug(f"Ghostscript version: {gs_version}")
            break

        if not self.gs_executable:
            raise RuntimeError(
                "Ghostscript is not installed or not in PATH. "
                f"Tried: {', '.join(gs_candidates)}. "
                "Please install Ghostscript to use this tool."
            )

    def _validate_inputs(self, input_path: Path, output_path: Path) -> None:
        """
        Validate input parameters and file paths.

        Creates the output directory as a side effect once validation passes.

        Args:
            input_path: Path to input PDF file
            output_path: Path for output PDF file

        Raises:
            FileNotFoundError: If input file doesn't exist
            ValueError: If parameters are invalid
        """
        if not input_path.exists():
            raise FileNotFoundError(f"Input PDF file not found: {input_path}")
        if input_path.suffix.lower() != '.pdf':
            raise ValueError(f"Input file must be a PDF: {input_path}")
        if self.quality not in self.QUALITY_PRESETS:
            raise ValueError(f"Quality must be one of {list(self.QUALITY_PRESETS.keys())}, got: {self.quality}")
        if self.compatibility not in self.SUPPORTED_COMPATIBILITY:
            raise ValueError(f"Compatibility must be 1.3-1.7, got: {self.compatibility}")
        # Ghostscript would open the output for writing while still reading
        # the input, so compressing a file onto itself must be rejected.
        if input_path.resolve() == output_path.resolve():
            raise ValueError(f"Output file must differ from input file: {input_path}")

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        self.logger.info(f"📄 Input PDF: {input_path}")
        self.logger.info(f"📦 Output PDF: {output_path}")
        self.logger.info(f"🎨 Quality preset: {self.quality}")
        self.logger.info(f"📋 PDF compatibility: {self.compatibility}")

    def _format_file_size(self, size_bytes: int) -> str:
        """
        Convert bytes to human-readable format.

        Negative values (the output grew instead of shrinking) are scaled by
        their magnitude and keep their sign.
        """
        value = float(size_bytes)
        for unit in ('B', 'KB', 'MB', 'GB'):
            if abs(value) < 1024.0:
                return f"{value:.1f} {unit}"
            value /= 1024.0
        return f"{value:.1f} TB"

    def _build_ghostscript_command(self, input_path: Path, output_path: Path) -> list[str]:
        """
        Build the Ghostscript command with optimized parameters.

        Args:
            input_path: Path to input PDF file
            output_path: Path for output PDF file

        Returns:
            List of command arguments for subprocess
        """
        quality_setting = self.QUALITY_PRESETS[self.quality]

        # Switches shared by every preset.
        command = [
            self.gs_executable,  # Use platform-specific executable
            '-sDEVICE=pdfwrite',
            f'-dCompatibilityLevel={self.compatibility}',
            f'-dPDFSETTINGS={quality_setting}',
            '-dNOPAUSE',
            '-dQUIET' if not self.verbose else '-dNOQUIET',
            '-dBATCH',
        ]
        if self.quality != 'minimal':
            # Enhanced mode: with additional quality improvements
            command += [
                '-dSAFER',                              # Security setting
                '-dAutoRotatePages=/None',              # Preserve page orientation
                '-dColorImageDownsampleType=/Bicubic',  # Better image quality
                '-dGrayImageDownsampleType=/Bicubic',
                '-dMonoImageDownsampleType=/Bicubic',
            ]
        # The output switch and the input file always come last.
        command += [f'-sOutputFile={output_path}', str(input_path)]
        return command

    def compress(self, input_path: Path, output_path: Path) -> dict:
        """
        Compress a PDF file using Ghostscript.

        Args:
            input_path: Path to input PDF file
            output_path: Path for compressed output PDF file

        Returns:
            Dictionary with compression statistics

        Raises:
            FileNotFoundError: If input file doesn't exist
            ValueError: If input parameters are invalid
            RuntimeError: If compression process fails
        """
        # Validate inputs
        self._validate_inputs(input_path, output_path)

        # Get original file size
        original_size = input_path.stat().st_size
        self.logger.info(f"📊 Original PDF size: {original_size:,} bytes ({self._format_file_size(original_size)})")

        # Build Ghostscript command
        command = self._build_ghostscript_command(input_path, output_path)
        self.logger.info("🔄 Compressing PDF with Ghostscript...")
        self.logger.debug(f"Command: {' '.join(command)}")

        try:
            # In verbose mode Ghostscript writes straight to the console, so
            # its output is only captured when not verbose.
            result = subprocess.run(
                command,
                check=True,
                capture_output=not self.verbose,
                text=True
            )
            self.logger.debug(f"Ghostscript return code: {result.returncode}")
        except subprocess.CalledProcessError as e:
            # Clean up output file if it was partially created
            if output_path.exists():
                output_path.unlink()
            error_msg = f"Ghostscript compression failed (exit code {e.returncode})"
            if e.stderr:
                error_msg += f": {e.stderr.strip()}"
            raise RuntimeError(error_msg) from e
        except Exception as e:
            # Clean up output file if it was partially created
            if output_path.exists():
                output_path.unlink()
            raise RuntimeError(f"PDF compression failed: {str(e)}") from e

        # Verify output file was created
        if not output_path.exists():
            raise RuntimeError("Ghostscript completed but output file was not created")

        # Calculate final statistics
        final_size = output_path.stat().st_size
        compression_ratio = (1 - final_size / original_size) * 100 if original_size > 0 else 0
        stats = {
            'original_size': original_size,
            'final_size': final_size,
            'compression_ratio': compression_ratio,
            'size_saved': original_size - final_size,
            'quality_preset': self.quality,
            'pdf_compatibility': self.compatibility
        }

        self.logger.info("✅ Compression complete!")
        self.logger.info(f"📊 Final size: {final_size:,} bytes ({self._format_file_size(final_size)})")
        self.logger.info(f"💾 Size reduction: {compression_ratio:.1f}% ({self._format_file_size(stats['size_saved'])} saved)")
        return stats
def create_argument_parser() -> argparse.ArgumentParser:
    """Build the command-line interface for the PDF compression tool."""
    # Assembled line by line; the leading and trailing empty entries keep the
    # newline that opens and closes the help epilog.
    usage_notes = "\n".join([
        "",
        "Examples:",
        "%(prog)s --input input.pdf --output output.pdf",
        "%(prog)s -i input.pdf -o output.pdf",
        "%(prog)s -i input.pdf -o output.pdf --quality printer",
        "%(prog)s -i input.pdf -o output.pdf --verbose",
        "%(prog)s -i input.pdf -o output.pdf -q screen -c 1.5 -v",
        "Quality Presets:",
        "screen: Lowest quality, smallest files (72 dpi) - for web viewing",
        "ebook: Good for e-readers (150 dpi) - DEFAULT, balanced size/quality",
        "printer: Good for printing (300 dpi) - higher quality",
        "prepress: Highest quality (300+ dpi) - for professional printing",
        "default: Ghostscript default settings - no optimization",
        "minimal: Exact match to original workflow commands - for compatibility",
        "PDF Compatibility:",
        "1.3: Oldest, most compatible (Acrobat 4.0+)",
        "1.4: Good compatibility (Acrobat 5.0+) - DEFAULT",
        "1.5: Modern features (Acrobat 6.0+)",
        "1.6: Advanced features (Acrobat 7.0+)",
        "1.7: Latest features (Acrobat 8.0+)",
        "",
    ])

    cli = argparse.ArgumentParser(
        description="Compress PDF files using Ghostscript with optimized settings for academic textbooks.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_notes,
    )

    # Required input/output paths share everything but their help text.
    path_options = (
        ('--input', '-i', 'Path to the input PDF file to compress'),
        ('--output', '-o', 'Path for the compressed output PDF file'),
    )
    for long_flag, short_flag, description in path_options:
        cli.add_argument(
            long_flag, short_flag,
            type=Path,
            required=True,
            metavar='PDF_FILE',
            help=description,
        )

    cli.add_argument(
        '--quality', '-q',
        choices=['screen', 'ebook', 'printer', 'prepress', 'default', 'minimal'],
        default='ebook',
        help='Compression quality preset (default: ebook)',
    )
    cli.add_argument(
        '--compatibility', '-c',
        choices=['1.3', '1.4', '1.5', '1.6', '1.7'],
        default='1.4',
        metavar='VERSION',
        help='PDF compatibility level (default: 1.4)',
    )
    cli.add_argument(
        '--verbose', '-v',
        action='store_true',
        help='Enable verbose output with detailed progress information',
    )
    cli.add_argument('--version', action='version', version='%(prog)s 1.0.0')
    return cli
def main(argv: Optional[list] = None) -> int:
    """
    Main entry point for the PDF compression tool.

    Args:
        argv: Command-line arguments to parse. Defaults to ``None``, in which
            case argparse reads ``sys.argv[1:]``; passing a list lets the
            tool be driven programmatically.

    Returns:
        Exit code (0 for success, 1 for error)
    """
    parser = create_argument_parser()
    args = parser.parse_args(argv)
    try:
        # Create compressor instance (also probes for a Ghostscript executable)
        compressor = PDFCompressor(
            quality=args.quality,
            compatibility=args.compatibility,
            verbose=args.verbose
        )

        # Perform compression
        stats = compressor.compress(args.input, args.output)

        # Success message
        print("\n🎉 PDF compression successful!")
        print(f"📁 Output: {args.output}")
        print(f"💾 Size reduction: {stats['compression_ratio']:.1f}%")
        print(f"🎨 Quality preset: {stats['quality_preset']}")
        return 0
    except KeyboardInterrupt:
        print("\n❌ Operation cancelled by user")
        return 1
    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        print(f"\n❌ Error: {str(e)}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        return 1


if __name__ == '__main__':
    sys.exit(main())