Files
cs249r_book/tools/scripts/content/check_duplicate_labels.py

538 lines
22 KiB
Python

#!/usr/bin/env python3
"""
🚫 Find Duplicate Labels in Quarto Files
This script recursively finds all .qmd files in a directory and identifies duplicate labels
(e.g., {#fig-xyz}) that can cause ambiguous cross-reference links in Quarto.
By default checks: figures, tables, sections, listings (the most common types)
Use flags to check other types or restrict to specific types.
Any duplicate label definition (same-file or cross-file) can cause reference confusion.
DESIGN PHILOSOPHY FOR PRE-COMMIT:
- FAIL on any duplicate labels (--strict mode)
- Fast execution for CI/CD workflows
- Clear exit codes: 0 = no duplicates, 1 = duplicates found
- Minimal output for automation (--quiet)
Exits with 0 if no duplicates found; exits with 1 if duplicates exist.
"""
import argparse
import re
import sys
import json
from pathlib import Path
from collections import defaultdict
# Regexes that recognise label DEFINITIONS only (never @references).
# Each pattern captures the full label (prefix included) in group 1.
_ATTRIBUTE_LABEL_TEMPLATE = r'\{#(%s-[\w-]+)'  # {#<prefix>-xxx}
_CELL_OPTION_LABEL_TEMPLATE = r'#\|\s*(?:label|%s-label):\s*(%s-[\w-]+)'  # #| label: <prefix>-xxx


def _label_patterns(prefix, with_cell_option=False):
    """Return the definition regexes for one label prefix (e.g. ``fig``)."""
    patterns = [_ATTRIBUTE_LABEL_TEMPLATE % prefix]
    if with_cell_option:
        # Also accept `#| label: <prefix>-xxx` and `#| <prefix>-label: <prefix>-xxx`.
        patterns.append(_CELL_OPTION_LABEL_TEMPLATE % (prefix, prefix))
    return patterns


# Label type name -> list of regexes; insertion order is the scan/report order.
LABEL_PATTERNS = {
    "Figure": _label_patterns("fig", with_cell_option=True),
    "Table": _label_patterns("tbl", with_cell_option=True),
    "Section": _label_patterns("sec"),
    "Equation": _label_patterns("eq"),
    "Listing": _label_patterns("lst", with_cell_option=True),
    "Video": _label_patterns("vid"),
    "Exercise": _label_patterns("exr"),
}
def find_qmd_files(directory: Path):
    """Return every ``.qmd`` file below *directory*, sorted.

    Args:
        directory: Root directory to search recursively.

    Returns:
        Sorted list of paths matching ``*.qmd`` (sorted so output is stable).

    Raises:
        FileNotFoundError: If *directory* does not exist.
        NotADirectoryError: If *directory* exists but is not a directory.
    """
    if directory.is_dir():
        return sorted(directory.rglob("*.qmd"))
    # Not a directory: tell "exists but is something else" apart from "missing".
    if directory.exists():
        raise NotADirectoryError(f"Path is not a directory: {directory}")
    raise FileNotFoundError(f"Directory not found: {directory}")
def is_in_code_block(lines, line_index):
    """Report whether ``lines[line_index]`` sits inside a ``` fenced block.

    Only the lines *before* ``line_index`` are inspected, so an opening fence
    line itself is reported as outside and a closing fence line as inside.

    Args:
        lines: All lines of the document.
        line_index: 0-based index of the line being classified.

    Returns:
        True when an odd number of fence lines precede ``line_index``.
    """
    fences_seen = sum(
        1
        for position in range(line_index)
        if lines[position].strip().startswith('```')
    )
    # Every fence toggles the state, so odd parity means "currently open".
    return fences_seen % 2 == 1
def get_format_context(lines, line_index):
    """Return the conditional-format context that applies to a line.

    Scans the fenced-div markers (lines starting with ``:::``) that precede
    ``line_index`` and keeps a stack of the divs that are still open.  Every
    opening fence is pushed, not only ``when-format`` ones, so that the closing
    fence of an unrelated div (for example a callout) closes that div instead
    of being counted against the enclosing or a later conditional block.

    Args:
        lines: All lines of the document.
        line_index: 0-based index of the line being classified; only lines
            before it are inspected.

    Returns:
        str: 'html' or 'pdf' when the line is inside a div whose opening fence
        contains ``when-format="html"`` / ``when-format="pdf"`` (innermost
        such div wins), otherwise 'default'.
    """
    # One entry per open div: 'html', 'pdf', or None for any other div.
    open_divs = []
    for raw_line in lines[:max(line_index, 0)]:
        line = raw_line.strip()
        if not line.startswith(':::'):
            continue
        if 'when-format="html"' in line:
            open_divs.append('html')
        elif 'when-format="pdf"' in line:
            open_divs.append('pdf')
        elif not line.strip(':'):
            # A fence made only of colons closes the innermost open div.
            # Ignore stray closers so an unbalanced file cannot corrupt state.
            if open_divs:
                open_divs.pop()
        else:
            # Any other opening fence (callout, column, ...) must be tracked
            # so its closing fence is matched with it.
            open_divs.append(None)

    for div_format in reversed(open_divs):
        if div_format is not None:
            return div_format
    return 'default'
def _iter_line_contexts(lines):
    """Yield ``(line_num, line, in_code_block, format_context)`` in one pass.

    The reported state is the one in effect *before* the line itself is
    processed: an opening ``` fence line is reported as outside the code
    block and a closing fence line as inside.
    """
    in_code_block = False
    # One entry per open fenced div: 'html', 'pdf', or None for other divs.
    open_divs = []
    for line_num, line in enumerate(lines, 1):
        format_context = next(
            (fmt for fmt in reversed(open_divs) if fmt is not None), 'default'
        )
        yield line_num, line, in_code_block, format_context

        stripped = line.strip()
        if stripped.startswith('```'):
            in_code_block = not in_code_block
        elif stripped.startswith(':::'):
            if 'when-format="html"' in stripped:
                open_divs.append('html')
            elif 'when-format="pdf"' in stripped:
                open_divs.append('pdf')
            elif not stripped.strip(':'):
                # Colons-only fence closes the innermost open div.
                if open_divs:
                    open_divs.pop()
            else:
                # Track unrelated divs so their closers are matched correctly.
                open_divs.append(None)


def build_label_map(files, label_types):
    """Collect every label DEFINITION found in *files*.

    Each file is scanned once; code-block and conditional-format state is
    carried forward line by line instead of being recomputed from the top of
    the file for every line.  Lines inside ``` fenced blocks are skipped.

    NOTE(review): because fenced blocks are skipped entirely, a ``#| label:``
    cell option written inside a fenced code cell is never scanned - confirm
    that this is intended.

    Args:
        files: Iterable of paths exposing ``read_text``.
        label_types: Mapping of label type name -> list of regex patterns
            whose group 1 captures the label.

    Returns:
        tuple: ``(label_map, stats)`` where ``label_map`` maps
        label -> [(file, line_num, label_type, format_context), ...] with
        1-based line numbers, and ``stats`` is
        ``{"files_processed": int, "total_labels": int}``.  Unreadable files
        are reported on stderr and not counted.
    """
    label_map = defaultdict(list)
    # Compile once up front rather than once per line.
    compiled_types = [
        (label_type, [re.compile(pattern) for pattern in patterns])
        for label_type, patterns in label_types.items()
    ]
    file_count = 0
    total_labels = 0

    for file in files:
        try:
            lines = file.read_text(encoding="utf-8").splitlines()
        except (OSError, UnicodeError) as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"⚠️ Warning: Could not read {file}: {e}", file=sys.stderr)
            continue
        file_count += 1

        for line_num, line, in_code_block, format_context in _iter_line_contexts(lines):
            if in_code_block:
                continue
            for label_type, patterns in compiled_types:
                for pattern in patterns:
                    for match in pattern.finditer(line):
                        label_map[match.group(1)].append(
                            (file, line_num, label_type, format_context)
                        )
                        total_labels += 1

    return label_map, {"files_processed": file_count, "total_labels": total_labels}
def find_duplicates(label_map):
    """Find labels that have true duplicate definitions.

    Rules:
      * A label defined in more than one file is always a duplicate; every
        occurrence is reported.
      * Within a single file, a label is a duplicate only when it is defined
        more than once under the same format context.  The same label in an
        HTML block and a PDF block is one logical definition and is not
        reported.

    Occurrences sharing the same (file, line number) are collapsed into one.
    ``label_map`` is never modified.

    Args:
        label_map: Dictionary of label -> [(file, line_num, label_type, format_context), ...]

    Returns:
        Dictionary of duplicate label -> list of its duplicate locations
        (same tuple shape as the input).
    """
    duplicates = {}
    for label, locations in label_map.items():
        if len(locations) <= 1:
            continue

        if len({loc[0] for loc in locations}) > 1:
            # Cross-file: copy so the caller's list is not aliased or mutated.
            candidates = list(locations)
        else:
            # Same file: only repeated definitions within one format context count.
            by_context = defaultdict(list)
            for loc in locations:
                by_context[(loc[0], loc[3])].append(loc)
            candidates = [
                loc
                for group in by_context.values()
                if len(group) > 1
                for loc in group
            ]

        # Collapse entries on the same (file, line) while preserving order.
        seen = set()
        unique_locations = []
        for loc in candidates:
            loc_key = (loc[0], loc[1])
            if loc_key not in seen:
                seen.add(loc_key)
                unique_locations.append(loc)

        if len(unique_locations) > 1:
            duplicates[label] = unique_locations
    return duplicates
def report_duplicates(duplicates, stats=None, quiet=False, format_type="text"):
    """Report duplicate labels found.

    When there are no duplicates a success line is printed only for the
    non-quiet "text" format; "json" and "summary" print nothing in that case.

    Args:
        duplicates: Dictionary of label -> [(file, line_num, label_type, format_context), ...]
        stats: Optional dict with 'files_processed' and 'total_labels'
        quiet: If True, minimal output (text format only)
        format_type: "text", "json", or "summary"

    Returns:
        True if no duplicates, False if duplicates found
    """
    def display_path(file):
        # Resolve first: relative_to() against the absolute cwd always raises
        # for a relative path, which used to turn every relative input into an
        # absolute path in the report.
        resolved = Path(file).resolve()
        try:
            return resolved.relative_to(Path.cwd().resolve())
        except ValueError:
            return resolved

    def context_suffix(format_context):
        return f" ({format_context})" if format_context != 'default' else ""

    if not duplicates:
        if not quiet and format_type == "text":
            if stats:
                print(f"✅ No duplicate labels found! Processed {stats['files_processed']} files, {stats['total_labels']} labels.")
            else:
                print("✅ No duplicate labels found!")
        return True

    if format_type == "json":
        # JSON output for automation; file paths are emitted exactly as given.
        result = {
            "status": "error",
            "duplicate_count": len(duplicates),
            "stats": stats or {},
            "duplicates": {
                label: [
                    {
                        "file": str(file),
                        "line": line_num,
                        "type": label_type,
                        "format_context": format_context,
                    }
                    for file, line_num, label_type, format_context in locations
                ]
                for label, locations in duplicates.items()
            },
        }
        print(json.dumps(result, indent=2))
        return False

    if format_type == "summary":
        # Brief summary for pre-commit
        cross_file_count = sum(
            1 for locs in duplicates.values() if len({loc[0] for loc in locs}) > 1
        )
        same_file_count = len(duplicates) - cross_file_count
        print("❌ DUPLICATE LABELS DETECTED:")
        print(f"{cross_file_count} cross-file duplicates")
        print(f"{same_file_count} same-file duplicates")
        print(f" • Total: {len(duplicates)} duplicate labels")
        if stats:
            print(f" • Processed: {stats['files_processed']} files, {stats['total_labels']} labels")
        print("\n💡 Run: python3 scripts/find_duplicate_labels.py -d <directory> --details")
        print(" to see specific locations and fix suggestions.")
        return False

    # text format (default)
    if quiet:
        # Minimal output: just the offending labels and their locations.
        for label, locations in sorted(duplicates.items()):
            print(f"🚫 {label}")
            for file, line_num, label_type, format_context in sorted(locations):
                print(f" 📍 {display_path(file)}:{line_num}{context_suffix(format_context)}")
        return False

    # Detailed output (default)
    print("🚫 Duplicate labels detected:\n")
    for label, locations in sorted(duplicates.items()):
        files_involved = {loc[0] for loc in locations}
        if len(files_involved) > 1:
            print(f"❌ Label '{label}' appears in {len(files_involved)} different files:")
        else:
            print(f"❌ Label '{label}' appears {len(locations)} times in same file:")
        for file, line_num, label_type, format_context in sorted(locations):
            print(f" 📍 {label_type:<10}: {display_path(file)}:{line_num}{context_suffix(format_context)}")
        print()  # Empty line for readability
    print(f"💥 Found {len(duplicates)} duplicate labels!")
    print("⚠️ These duplicates can cause ambiguous cross-reference links!")
    if stats:
        print(f"📊 Processed {stats['files_processed']} files with {stats['total_labels']} total labels")
    print("\n🔧 To fix these issues:")
    print(" 1. Rename one of the duplicate labels in each conflict")
    print(" 2. Update any cross-references (@label) to use the new names")
    print(" 3. Ensure each label is unique across your entire project")
    return False
def generate_suggestions(duplicates):
    """Print a suggested replacement label for every duplicate location.

    The suggestion is ``<label>-<chapter>``, where the chapter is the parent
    directory name, or the file stem when that name is 'core' or empty.  When
    that name is already taken for the same label a numeric suffix
    (``-2``, ``-3``, ...) is appended so the suggested labels are themselves
    unique.  Occurrences in the same file under *different* format contexts
    (e.g. an HTML and a PDF variant) are one logical definition and share the
    same suggestion.

    Args:
        duplicates: Dictionary of label -> [(file, line_num, label_type, format_context), ...]
            where ``file`` is a ``pathlib.Path``.
    """
    if not duplicates:
        return

    cwd = Path.cwd().resolve()
    print("\n💡 Suggested fixes:")
    print("=" * 50)
    for label, locations in sorted(duplicates.items()):
        print(f"\nFor label '{label}':")
        # suggested label -> {(file, format_context), ...} already using it
        claimed = {}
        for file, line_num, _label_type, format_context in sorted(locations):
            parent_name = file.parent.name
            chapter_name = parent_name if parent_name and parent_name != 'core' else file.stem
            base_label = f"{label}-{chapter_name}"

            suggested_label = base_label
            suffix = 1
            # A name may be reused only by the same file in another format context.
            while any(
                holder_file != file or holder_context == format_context
                for holder_file, holder_context in claimed.get(suggested_label, ())
            ):
                suffix += 1
                suggested_label = f"{base_label}-{suffix}"
            claimed.setdefault(suggested_label, set()).add((file, format_context))

            # Resolve before relative_to(): a relative path is never "inside"
            # the absolute cwd and would otherwise always print as absolute.
            resolved = file.resolve()
            try:
                rel_path = resolved.relative_to(cwd)
            except ValueError:
                rel_path = resolved
            context_info = f" ({format_context})" if format_context != 'default' else ""
            print(f" 📝 In {rel_path}:{line_num}{context_info}")
            print(f" Change: {{#{label}}} → {{#{suggested_label}}}")
            print(f" Update references: @{label} → @{suggested_label}")
def create_precommit_config():
    """Generate a sample pre-commit configuration.

    Returns:
        str: A YAML snippet for ``.pre-commit-config.yaml`` with the hook
        nested under ``repos`` -> ``hooks`` so it can be pasted as-is; no
        leading or trailing whitespace.
    """
    # Built line by line so the YAML nesting is explicit and cannot be lost
    # to source re-indentation.
    config_lines = [
        "# Add to .pre-commit-config.yaml",
        "repos:",
        "  - repo: local",
        "    hooks:",
        "      - id: check-duplicate-labels",
        "        name: Check for duplicate Quarto labels",
        "        entry: python3 scripts/find_duplicate_labels.py",
        "        args: ['-d', 'contents/core/', '--figures', '--tables', '--listings', '--quiet', '--strict']",
        "        language: system",
        r"        files: '\.qmd$'",
        "        pass_filenames: false",
    ]
    return "\n".join(config_lines)
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    ``-d/--dir`` is declared optional so that ``--precommit-config`` can be
    used on its own.  ``--strict`` is a ``store_true`` flag whose default is
    already True, so its value is True whether or not it is passed.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    usage_examples = """
Examples:
# Pre-commit usage (focused on critical types)
python3 find_duplicate_labels.py -d contents/core/ --figures --tables --listings --quiet --strict
# Check only figures and tables
python3 find_duplicate_labels.py -d contents/core/ --figures --tables
# Check only figures
python3 find_duplicate_labels.py -d contents/core/ --figures-only
# Check all label types
python3 find_duplicate_labels.py -d contents/core/ --all-types
# Development usage with suggestions
python3 find_duplicate_labels.py -d contents/core/ --suggestions
# JSON output for automation
python3 find_duplicate_labels.py -d contents/core/ --format json
PRE-COMMIT INTEGRATION:
python3 find_duplicate_labels.py -d contents/core/ --figures --tables --listings --quiet --strict
Exit code 0 = no duplicates, 1 = duplicates found
Add to .pre-commit-config.yaml:
- repo: local
hooks:
- id: check-duplicate-labels
name: Check duplicate Quarto labels
entry: python3 scripts/find_duplicate_labels.py
args: ['-d', 'contents/core/', '--figures', '--tables', '--listings', '--quiet', '--strict']
language: system
files: '\\.qmd$'
pass_filenames: false
Duplicate Label Issues Fixed:
- Multiple files with {#fig-architecture} → Wrong @fig-architecture links
- Duplicate {#tbl-results} across chapters → Ambiguous table references
- Same {#sec-introduction} in multiple files → Broken section links
"""
    parser = argparse.ArgumentParser(
        description="Find duplicate labels across .qmd files that could cause wrong cross-reference links.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Main input argument
    parser.add_argument(
        "-d", "--dir", type=Path, required=False,
        help="Directory to search for .qmd files (searches recursively)",
    )

    # Label-type switches followed by the convenience flags; registration
    # order is the order shown in --help.
    type_switches = (
        ("--figures", "Check figures (default: enabled)"),
        ("--tables", "Check tables (default: enabled)"),
        ("--sections", "Check sections (default: enabled)"),
        ("--listings", "Check listings (default: enabled)"),
        ("--equations", "Check equations (default: disabled)"),
        ("--videos", "Check videos (default: disabled)"),
        ("--exercises", "Check exercises (default: disabled)"),
        ("--all-types", "Check all label types"),
        ("--figures-only", "Check figures only"),
        ("--tables-only", "Check tables only"),
        ("--sections-only", "Check sections only"),
        ("--listings-only", "Check listings only"),
    )
    for flag, help_text in type_switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    # Detection mode
    parser.add_argument(
        "--strict", action="store_true", default=True,
        help="FAIL on any duplicates (exit code 1) - recommended for pre-commit",
    )

    # Output options
    parser.add_argument(
        "--format", choices=["text", "json", "summary"], default="text",
        help="Output format: text (detailed), json (machine-readable), summary (brief)",
    )
    output_switches = (
        ("--quiet", "Minimal output: just print warnings with icons for problematic labels and files"),
        ("--details", "Show detailed output (opposite of --quiet)"),
        ("--suggestions", "Generate suggested fixes for duplicate labels"),
        ("--precommit-config", "Show sample pre-commit configuration"),
    )
    for flag, help_text in output_switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    return parser.parse_args()
def main():
    """Command-line entry point: scan a directory and report duplicate labels.

    Returns 0 after printing the sample config for ``--precommit-config``.
    Every other path ends in ``sys.exit``: status 0 when no duplicates were
    found, 1 when duplicates exist, the directory is invalid, or it contains
    no .qmd files; a missing ``-d/--dir`` goes through argparse's error exit.
    """
    args = parse_args()

    # Special case that needs no directory.
    if args.precommit_config:
        print(create_precommit_config())
        return 0

    # Directory is required for all other operations.
    if not args.dir:
        argparse.ArgumentParser().error("the following arguments are required: -d/--dir")

    # --details overrides --quiet.
    quiet = args.quiet and not args.details
    verbose_text = not quiet and args.format == "text"

    # Decide which label types to check; "*-only" flags win, in this order.
    only_flags = (
        ("figures_only", "Figure"),
        ("tables_only", "Table"),
        ("sections_only", "Section"),
        ("listings_only", "Listing"),
    )
    only_type = next((name for attr, name in only_flags if getattr(args, attr)), None)
    if only_type is not None:
        label_types = {only_type: LABEL_PATTERNS[only_type]}
    elif args.all_types:
        label_types = LABEL_PATTERNS
    else:
        type_flags = (
            ("figures", "Figure"),
            ("tables", "Table"),
            ("sections", "Section"),
            ("listings", "Listing"),
            ("equations", "Equation"),
            ("videos", "Video"),
            ("exercises", "Exercise"),
        )
        # Explicitly requested types, if any ...
        label_types = {
            name: LABEL_PATTERNS[name]
            for attr, name in type_flags
            if getattr(args, attr)
        }
        if not label_types:
            # ... otherwise the defaults: figures, tables, sections, listings.
            label_types = {
                name: LABEL_PATTERNS[name]
                for name in ("Figure", "Table", "Section", "Listing")
            }

    # Find all .qmd files in directory
    try:
        qmd_files = find_qmd_files(args.dir)
    except (FileNotFoundError, NotADirectoryError) as e:
        print(f"❌ Error: {e}", file=sys.stderr)
        sys.exit(1)

    if not qmd_files:
        if not quiet:
            print(f"❌ No .qmd files found in {args.dir}", file=sys.stderr)
        sys.exit(1)

    if verbose_text:
        checked_types = ", ".join(label_types.keys())
        print(f"🔍 Scanning {len(qmd_files)} .qmd files in {args.dir}")
        print(f"🏷️ Checking: {checked_types}")

    # Build complete label map across all files
    label_map, stats = build_label_map(qmd_files, label_types)
    if verbose_text:
        print(f"📊 Found {stats['total_labels']} labels across {stats['files_processed']} files")

    # Find and report duplicates
    duplicates = find_duplicates(label_map)
    success = report_duplicates(duplicates, stats=stats, quiet=quiet, format_type=args.format)

    # Suggestions are only produced alongside the text format.
    if args.suggestions and duplicates and args.format == "text":
        generate_suggestions(duplicates)

    # Print final status for text format
    if verbose_text:
        if success:
            print("\n✅ All labels are unique! No duplicate label conflicts found.")
        else:
            print("\n❌ Found duplicate labels that could cause wrong cross-reference links!")
            if not args.suggestions:
                print(" Run with --suggestions flag to get fix recommendations.")

    # Exit with appropriate code for pre-commit
    sys.exit(0 if success else 1)
# Run the checker only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()