mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-04-29 17:20:21 -05:00
feat: Add comprehensive EPUB validator with epubcheck integration
- Create validate_epub.py utility for EPUB validation
- Integrate the official epubcheck validator when available
- Add custom checks for CSS variables and XML comment violations
- Detect common XHTML errors (unclosed tags, unescaped characters)
- Validate EPUB structure (mimetype, container.xml, OPF)
- Support --quick flag to skip epubcheck for faster validation
- Provide detailed error reporting with file paths and line numbers
This commit is contained in:
409
tools/scripts/utilities/validate_epub.py
Executable file
409
tools/scripts/utilities/validate_epub.py
Executable file
@@ -0,0 +1,409 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
EPUB Validator Script
|
||||
|
||||
Validates EPUB files for common issues including:
|
||||
- XML parsing errors (double-hyphen in comments)
|
||||
- CSS variable issues (--variable syntax)
|
||||
- Malformed HTML/XHTML
|
||||
- Missing required files
|
||||
- Structural validation
|
||||
|
||||
Uses epubcheck (official EPUB validator) if available, with custom checks for project-specific issues.
|
||||
|
||||
Installation:
|
||||
# Install epubcheck (recommended)
|
||||
brew install epubcheck # macOS
|
||||
# OR download from: https://github.com/w3c/epubcheck/releases
|
||||
|
||||
Usage:
|
||||
python3 validate_epub.py <path_to_epub_file>
|
||||
python3 validate_epub.py quarto/_build/epub/Machine-Learning-Systems.epub
|
||||
python3 validate_epub.py --quick <path_to_epub_file> # Skip epubcheck
|
||||
"""
|
||||
|
||||
import sys
|
||||
import zipfile
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Dict
|
||||
import tempfile
|
||||
import shutil
|
||||
import subprocess
|
||||
import json
|
||||
|
||||
|
||||
class EPUBValidator:
    """Validate an EPUB archive with epubcheck plus project-specific checks.

    Findings are accumulated in ``errors`` and ``warnings`` as
    ``(severity, category, message)`` tuples. Only entries in ``errors``
    make :meth:`validate` return ``False``; warnings are informational.
    """

    # Patterns used by the custom checks, compiled once for all files.
    # One XML comment; group 1 is the text between '<!--' and '-->'.
    _XML_COMMENT_RE = re.compile(r'<!--(.*?)-->', re.DOTALL)
    _CSS_COMMENT_RE = re.compile(r'/\*.*?\*/', re.DOTALL)
    _CSS_VAR_DECL_RE = re.compile(r'^\s*(--[\w-]+)\s*:', re.MULTILINE)
    # Also matches "var( --x )" and "var(--x, fallback)".
    _CSS_VAR_USE_RE = re.compile(r'var\(\s*(--[\w-]+)')
    # '&' that does not start a named, decimal or hexadecimal reference.
    # Entity names may contain digits (e.g. &frac12;).
    _BARE_AMP_RE = re.compile(r'&(?![a-zA-Z][a-zA-Z0-9]*;|#\d+;|#x[0-9a-fA-F]+;)')
    _BARE_LT_RE = re.compile(r'<(?![a-zA-Z/!?])')
    # An <img ...> tag that is closed on the same line without '/>'.
    _OPEN_IMG_RE = re.compile(r'<img\b[^>]*(?<!/)>')
    # A start tag whose properly quoted attributes (if any) are followed by
    # an attribute whose value does not begin with a quote. Skipping quoted
    # values first avoids matching "a=b" text inside an attribute value.
    _UNQUOTED_ATTR_RE = re.compile(
        r'''<[A-Za-z][\w:.-]*(?:\s+[\w:.-]+\s*=\s*(?:"[^"]*"|'[^']*'))*\s+[\w:.-]+\s*=\s*[^\s"'>]'''
    )

    def __init__(self, epub_path: str, use_epubcheck: bool = True):
        """Store the target path and options; no I/O happens here.

        Args:
            epub_path: Path to the ``.epub`` file to validate.
            use_epubcheck: When true, :meth:`validate` first tries to run the
                external ``epubcheck`` command.
        """
        self.epub_path = Path(epub_path)
        self.errors: List[Tuple[str, str, str]] = []  # (severity, category, message)
        self.warnings: List[Tuple[str, str, str]] = []
        # Extraction directory used during validate(); it is removed again
        # before validate() returns.
        self.temp_dir = None
        self.use_epubcheck = use_epubcheck

    def validate(self) -> bool:
        """Run all validation checks and print a report.

        Returns:
            True if no errors were recorded (warnings do not count).
        """
        print(f"🔍 Validating EPUB: {self.epub_path.name}\n")

        if not self.epub_path.exists():
            self._add_error("CRITICAL", "File", f"EPUB file not found: {self.epub_path}")
            # Report the failure instead of exiting silently.
            self._print_results()
            return False

        # Run epubcheck first if available
        if self.use_epubcheck:
            self._run_epubcheck()

        # Extract EPUB to a temp directory; the finally clause guarantees it
        # is removed even when extraction or a check fails.
        self.temp_dir = tempfile.mkdtemp()
        try:
            try:
                with zipfile.ZipFile(self.epub_path, 'r') as zip_ref:
                    zip_ref.extractall(self.temp_dir)
            except zipfile.BadZipFile:
                self._add_error("CRITICAL", "Structure", "Invalid ZIP/EPUB file")
                self._print_results()
                return False
            except OSError as e:
                # e.g. the path is a directory or is not readable.
                self._add_error("CRITICAL", "File", f"Could not read EPUB file: {e}")
                self._print_results()
                return False

            # Run custom validation checks (project-specific)
            print("\n📋 Running custom validation checks...")
            self._check_mimetype()
            self._check_container_xml()
            self._check_css_variables()
            self._check_xml_comments()
            self._check_common_xhtml_errors()
            self._check_xhtml_validity()
            self._check_opf_structure()

            self._print_results()
        finally:
            shutil.rmtree(self.temp_dir, ignore_errors=True)

        return len(self.errors) == 0

    def _add_error(self, severity: str, category: str, message: str):
        """Record an error; any recorded error makes validation fail."""
        self.errors.append((severity, category, message))

    def _add_warning(self, severity: str, category: str, message: str):
        """Record a warning; warnings never make validation fail."""
        self.warnings.append((severity, category, message))

    @staticmethod
    def _format_epubcheck_location(loc) -> str:
        """Render one epubcheck location dict as ``path[:line[:column]]``.

        Non-positive line/column values are omitted; epubcheck is expected
        to emit -1 when a position is unknown.
        """
        path = loc.get('path', '')
        line = loc.get('line', '')
        col = loc.get('column', '')
        if isinstance(line, int) and line > 0:
            if isinstance(col, int) and col > 0:
                return f"{path}:{line}:{col}"
            return f"{path}:{line}"
        return path

    def _run_epubcheck(self):
        """Run the external ``epubcheck`` command and record its findings.

        ERROR/FATAL messages become errors and WARNING messages become
        warnings. A missing executable is reported on stdout and skipped;
        a timeout is recorded as an error.
        """
        print("🔧 Running epubcheck (official EPUB validator)...\n")

        try:
            # '--json -' asks epubcheck to write its JSON report to stdout.
            result = subprocess.run(
                ['epubcheck', '--json', '-', str(self.epub_path)],
                capture_output=True,
                text=True,
                timeout=120,
            )

            if result.returncode == 0:
                print("✅ epubcheck: PASS\n")
                return

            try:
                output = json.loads(result.stdout) if result.stdout else {}
            except json.JSONDecodeError:
                # Fallback: show the raw text output and note the problem.
                if result.stderr:
                    print(f"⚠️ epubcheck output (text mode):\n{result.stderr}\n")
                self._add_warning("WARNING", "epubcheck", "Could not parse JSON output")
                return

            error_count = 0
            warning_count = 0
            for msg in output.get('messages', []):
                severity = msg.get('severity', 'INFO')
                message_text = msg.get('message', 'Unknown error')
                locations = msg.get('locations', [])

                # Only the first reported location is shown.
                location_str = self._format_epubcheck_location(locations[0]) if locations else ""
                full_message = f"{location_str}: {message_text}" if location_str else message_text

                if severity in ('ERROR', 'FATAL'):
                    self._add_error("ERROR", "epubcheck", full_message)
                    error_count += 1
                elif severity == 'WARNING':
                    self._add_warning("WARNING", "epubcheck", full_message)
                    warning_count += 1

            print(f"❌ epubcheck found {error_count} errors, {warning_count} warnings\n")

        except FileNotFoundError:
            print("⚠️ epubcheck not found. Install with: brew install epubcheck")
            print(" Skipping official EPUB validation.\n")
        except subprocess.TimeoutExpired:
            self._add_error("ERROR", "epubcheck", "Validation timed out after 120 seconds")
        except Exception as e:
            # Best effort: epubcheck is optional, so an unexpected failure
            # here must not abort the custom checks.
            print(f"⚠️ Could not run epubcheck: {e}\n")

    def _check_mimetype(self):
        """Check that a ``mimetype`` file exists and names the EPUB media type."""
        mimetype_path = Path(self.temp_dir) / "mimetype"
        if not mimetype_path.exists():
            self._add_error("ERROR", "Structure", "Missing mimetype file")
            return

        # Decode explicitly so the result does not depend on the locale;
        # undecodable bytes simply yield an "Invalid mimetype" error.
        content = mimetype_path.read_text(encoding="utf-8", errors="replace").strip()
        if content != "application/epub+zip":
            self._add_error("ERROR", "Structure", f"Invalid mimetype: {content}")

    def _check_container_xml(self):
        """Check that META-INF/container.xml parses and declares a rootfile."""
        container_path = Path(self.temp_dir) / "META-INF" / "container.xml"
        if not container_path.exists():
            self._add_error("ERROR", "Structure", "Missing META-INF/container.xml")
            return

        try:
            root = ET.parse(container_path).getroot()
        except ET.ParseError as e:
            self._add_error("ERROR", "XML", f"Invalid container.xml: {e}")
            return

        rootfiles = root.findall(".//{urn:oasis:names:tc:opendocument:xmlns:container}rootfile")
        if not rootfiles:
            self._add_error("ERROR", "Structure", "No rootfile found in container.xml")

    def _check_css_variables(self):
        """Flag CSS custom properties (``--name`` / ``var(--name)``) in CSS files.

        Declarations and usages are errors; any other ``--`` outside CSS
        comments is a warning.
        """
        print("📝 Checking CSS files for CSS variables...")

        for css_file in sorted(Path(self.temp_dir).rglob("*.css")):
            rel_path = css_file.relative_to(self.temp_dir)
            try:
                content = css_file.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as e:
                self._add_warning("WARNING", "CSS", f"{rel_path}: Could not read file: {e}")
                continue

            # Check for CSS variable declarations (--variable-name)
            var_declarations = self._CSS_VAR_DECL_RE.findall(content)
            if var_declarations:
                self._add_error("ERROR", "CSS",
                                f"{rel_path}: Found CSS variable declarations: {', '.join(var_declarations[:5])}")

            # Check for CSS variable usage (var(--variable-name)); dedupe
            # while keeping first-seen order so the message is deterministic.
            var_usage = list(dict.fromkeys(self._CSS_VAR_USE_RE.findall(content)))
            if var_usage:
                self._add_error("ERROR", "CSS",
                                f"{rel_path}: Found CSS variable usage: {', '.join(var_usage[:5])}")

            # Count total double-hyphens (for reference)
            double_hyphen_count = content.count('--')
            if double_hyphen_count > 0:
                # Check if they're only in comments
                without_comments = self._CSS_COMMENT_RE.sub('', content)
                double_hyphens_in_code = without_comments.count('--')

                if double_hyphens_in_code > 0:
                    self._add_warning("WARNING", "CSS",
                                      f"{rel_path}: Found {double_hyphens_in_code} double-hyphens outside comments")
                else:
                    print(f" ✓ {rel_path}: {double_hyphen_count} double-hyphens (all in comments)")

    def _check_xml_comments(self):
        """Report XML comments whose body contains ``--``.

        Each comment is inspected on its own, so text between two valid
        comments is never mistaken for comment content. One error is
        recorded per offending comment, at the line of the first ``--``.
        """
        print("\n📝 Checking for XML comment violations...")

        xml_files = sorted(
            path
            for pattern in ("*.xhtml", "*.xml", "*.opf")
            for path in Path(self.temp_dir).rglob(pattern)
        )

        for xml_file in xml_files:
            rel_path = xml_file.relative_to(self.temp_dir)
            try:
                content = xml_file.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as e:
                self._add_warning("WARNING", "XML", f"{rel_path}: Could not check comments: {e}")
                continue

            for match in self._XML_COMMENT_RE.finditer(content):
                body = match.group(1)
                bad_at = body.find('--')
                if bad_at == -1:
                    # A body ending in '-' forms '--' together with the
                    # closing delimiter ("--->"), which is also prohibited.
                    if not body.endswith('-'):
                        continue
                    bad_at = len(body) - 1
                offset = match.start(1) + bad_at
                line_no = content.count('\n', 0, offset) + 1
                self._add_error("ERROR", "XML",
                                f"{rel_path}:{line_no}: Comment contains '--' (double-hyphen)")

    def _check_common_xhtml_errors(self):
        """Warn about common XHTML mistakes using line-based heuristics.

        Covers non-self-closing ``<br>``/``<hr>``/``<img>``, possibly
        unescaped ``&`` and ``<``, and unquoted attribute values. These are
        heuristics, so everything is reported as a warning.
        """
        print("\n📝 Checking for common XHTML errors...")

        for xhtml_file in sorted(Path(self.temp_dir).rglob("*.xhtml")):
            rel_path = xhtml_file.relative_to(self.temp_dir)
            try:
                content = xhtml_file.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as e:
                self._add_warning("WARNING", "XHTML",
                                  f"{rel_path}: Could not check for common errors: {e}")
                continue

            for i, line in enumerate(content.split('\n'), 1):
                # Check for unclosed void tags (common patterns)
                if '<br>' in line:
                    self._add_warning("WARNING", "XHTML",
                                      f"{rel_path}:{i}: Use self-closing <br/> instead of <br>")

                if self._OPEN_IMG_RE.search(line):
                    self._add_warning("WARNING", "XHTML",
                                      f"{rel_path}:{i}: <img> tag should be self-closing")

                if '<hr>' in line:
                    self._add_warning("WARNING", "XHTML",
                                      f"{rel_path}:{i}: Use self-closing <hr/> instead of <hr>")

                # Check for unescaped ampersands (except entities)
                if self._BARE_AMP_RE.search(line):
                    self._add_warning("WARNING", "XHTML",
                                      f"{rel_path}:{i}: Possibly unescaped ampersand (&)")

                # Check for '<' that does not start a tag, comment or PI
                if self._BARE_LT_RE.search(line):
                    self._add_warning("WARNING", "XHTML",
                                      f"{rel_path}:{i}: Possibly unescaped < character")

                # Check for attributes without quotes
                if self._UNQUOTED_ATTR_RE.search(line):
                    self._add_warning("WARNING", "XHTML",
                                      f"{rel_path}:{i}: Attribute values should be quoted")

    def _check_xhtml_validity(self):
        """Check that every ``.xhtml`` file parses as well-formed XML."""
        print("\n📝 Checking XHTML validity...")

        for xhtml_file in sorted(Path(self.temp_dir).rglob("*.xhtml")):
            rel_path = xhtml_file.relative_to(self.temp_dir)
            try:
                # Try to parse as XML (XHTML should be well-formed XML)
                ET.parse(xhtml_file)
                print(f" ✓ {rel_path}: Valid XHTML")
            except ET.ParseError as e:
                self._add_error("ERROR", "XHTML", f"{rel_path}: Parse error - {e}")

    def _check_opf_structure(self):
        """Check that each OPF file has metadata, manifest and spine elements.

        A file is reported as valid only when all three are present.
        """
        print("\n📝 Checking OPF structure...")

        opf_files = sorted(Path(self.temp_dir).rglob("*.opf"))
        if not opf_files:
            self._add_error("ERROR", "Structure", "No OPF file found")
            return

        namespaces = {'opf': 'http://www.idpf.org/2007/opf'}
        for opf_file in opf_files:
            rel_path = opf_file.relative_to(self.temp_dir)
            try:
                root = ET.parse(opf_file).getroot()
            except ET.ParseError as e:
                self._add_error("ERROR", "OPF", f"{rel_path}: Parse error - {e}")
                continue

            missing = [
                name for name in ("metadata", "manifest", "spine")
                if root.find(f'.//opf:{name}', namespaces) is None
            ]
            for name in missing:
                self._add_error("ERROR", "OPF", f"{rel_path}: Missing {name} element")
            if not missing:
                print(f" ✓ {rel_path}: Valid OPF structure")

    def _print_results(self):
        """Print the collected errors and warnings followed by a verdict."""
        print("\n" + "="*70)
        print("📊 VALIDATION RESULTS")
        print("="*70)

        if not self.errors and not self.warnings:
            print("\n✅ SUCCESS: No issues found!")
            print(f" {self.epub_path.name} is valid")
            return

        if self.errors:
            print(f"\n❌ ERRORS FOUND: {len(self.errors)}")
            print("-" * 70)
            for severity, category, message in self.errors:
                print(f" [{severity}] [{category}] {message}")

        if self.warnings:
            print(f"\n⚠️ WARNINGS: {len(self.warnings)}")
            print("-" * 70)
            for severity, category, message in self.warnings:
                print(f" [{severity}] [{category}] {message}")

        print("\n" + "="*70)
        if self.errors:
            print("❌ VALIDATION FAILED")
        else:
            print("✅ VALIDATION PASSED (with warnings)")
        print("="*70)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: validate one EPUB and exit 0 or 1.

    Prints usage and exits with status 1 when no arguments are given or no
    EPUB path is supplied. ``--quick`` disables the epubcheck step.
    """
    cli_args = sys.argv[1:]

    if not cli_args:
        usage_lines = (
            "Usage: python3 validate_epub.py [--quick] <path_to_epub_file>",
            "\nOptions:",
            " --quick Skip epubcheck validation (faster, custom checks only)",
            "\nExamples:",
            " python3 validate_epub.py quarto/_build/epub/Machine-Learning-Systems.epub",
            " python3 validate_epub.py --quick quarto/_build/epub/Machine-Learning-Systems.epub",
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    # '--quick' anywhere on the command line turns epubcheck off.
    run_epubcheck = '--quick' not in cli_args

    # The first non-empty argument that is not the flag is the EPUB path;
    # any further arguments are ignored.
    target = next((arg for arg in cli_args if arg != '--quick' and arg), None)
    if target is None:
        print("Error: No EPUB file specified")
        sys.exit(1)

    passed = EPUBValidator(target, use_epubcheck=run_epubcheck).validate()
    exit_status = 0 if passed else 1
    sys.exit(exit_status)


# Run the validator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user