Files
cs249r_book/book/tools/scripts/content/manage_captions.py
Vijay Janapa Reddi 9781727d60 refactor: rename advanced_intro to introduction and update scripts
- Renamed vol2/advanced_intro to vol2/introduction for consistency
- Updated all scripts and configs to use vol1/ instead of core/
- Updated pre-commit config to check all contents/ not just vol1/
- Updated path references in Lua filters, Python scripts, and configs
2026-01-01 14:46:52 -05:00

4156 lines
177 KiB
Python
Executable File
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Figure Caption Improvement Script
A streamlined tool for improving figure and table captions in Quarto-based textbooks
using local Ollama LLM models with strong, educational language.
Main Modes:
1. --improve/-i: LLM caption improvement and file updates (default)
2. --build-map/-b: Build content map from QMD files and save to JSON
3. --analyze/-a: Quality analysis and file structure validation
4. --repair/-r: Fix formatting issues only (no LLM)
Features:
- Follows _quarto.yml chapter ordering
- 100% extraction success (270 figures, 92 tables)
- Strong language improvements (removes weak starters)
- Proper formatting (spacing, capitalization, table prefixes)
- Context-aware processing with paragraph-level analysis
"""
import argparse
import base64
import json
import os
import re
import requests
import subprocess
import yaml
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from datetime import datetime
from titlecase import titlecase
import pypandoc
class CaptionQualityChecker:
    """Analyzes caption quality and identifies issues.

    Every rule is a method that takes the caption text and returns an
    ``(is_good, issue_description)`` tuple; the description is an empty string
    when the rule passes. ``analyze_caption`` runs all rules and collects the
    failures.
    """

    # Openers that name the medium instead of describing the content.
    # Matched case-insensitively with re.search (each pattern is anchored).
    _GENERIC_PATTERNS = (
        r'^Figure shows',
        r'^Figure \d+',
        r'^Table shows',
        r'^Table \d+',
        r'^Diagram of',
        r'^Image of',
        r'^Screenshot',
        r'^Example$',
        r'^Overview$',
        r'^Comparison$',
    )

    def __init__(self):
        # Rule name -> checker. Insertion order is the order in which issues
        # are reported by analyze_caption.
        self.quality_rules = {
            'missing_punctuation': self._check_punctuation,
            'poor_capitalization': self._check_capitalization,
            'too_generic': self._check_generic,
            'missing_bold_pattern': self._check_bold_pattern,
            'broken_formatting': self._check_formatting,
        }

    def _check_punctuation(self, caption: str) -> Tuple[bool, str]:
        """Check if caption ends with '.', '!' or '?' (empty captions pass)."""
        if not caption:
            return True, ""
        if caption.strip().endswith(('.', '!', '?')):
            return True, ""
        return False, "Missing period"

    def _check_capitalization(self, caption: str) -> Tuple[bool, str]:
        """Flag captions that are entirely upper case or entirely lower case."""
        if not caption:
            return True, ""
        if caption.isupper():
            return False, "All caps"
        # A caption that opens with a bold marker is exempt from the
        # all-lowercase check.
        if caption.islower() and not caption.startswith('**'):
            return False, "All lowercase"
        return True, ""

    def _check_generic(self, caption: str) -> Tuple[bool, str]:
        """Check for overly generic captions (see ``_GENERIC_PATTERNS``)."""
        if not caption:
            return True, ""
        if any(re.search(pattern, caption, re.IGNORECASE)
               for pattern in self._GENERIC_PATTERNS):
            return False, "Too generic"
        return True, ""

    def _check_bold_pattern(self, caption: str) -> Tuple[bool, str]:
        """Check if caption follows the ``**Concept**: explanation`` pattern.

        Unlike the other rules, an empty caption fails this one.
        """
        if not caption:
            return False, "Empty caption"
        caption = caption.strip()
        if not re.match(r'^\*\*[^*]+\*\*:', caption):
            return False, "Missing **Bold**: explanation format"
        # Even a well-formed caption must carry some explanation text.
        if len(caption) < 20:
            return False, "Too short even with proper format"
        return True, ""

    def _check_formatting(self, caption: str) -> Tuple[bool, str]:
        """Check for broken markdown formatting."""
        if not caption:
            return True, ""
        # Bold markers come in pairs; an odd count means one is unmatched.
        if caption.count('**') % 2 != 0:
            return False, "Unmatched ** markers"
        if '{{' in caption or '}}' in caption:
            return False, "LaTeX artifacts"
        return True, ""

    def analyze_caption(self, caption: str) -> Dict[str, object]:
        """Analyze a single caption and return a quality report.

        Args:
            caption: The caption text to check.

        Returns:
            A dict with keys ``caption`` (the input), ``issues`` (a list of
            ``{'type', 'description'}`` dicts, one per failed rule),
            ``needs_repair`` (True when any rule failed) and ``suggestions``
            (always an empty list; no rule produces suggestions).
        """
        issues = []
        for rule_name, rule_func in self.quality_rules.items():
            is_good, issue_desc = rule_func(caption)
            if not is_good:
                issues.append({'type': rule_name, 'description': issue_desc})
        return {
            'caption': caption,
            'issues': issues,
            'needs_repair': bool(issues),
            'suggestions': [],
        }
class FigureCaptionImprover:
"""
Main class for improving figure and table captions using local Ollama LLM models.
Provides streamlined modes:
- build_map: Extract content structure and save to JSON
- analyze: Quality analysis and file validation
- repair: Fix formatting issues only
- improve: Complete LLM caption improvement (default)
Features:
- Follows _quarto.yml chapter ordering
- 100% extraction success rate (270 figures, 92 tables)
- Strong language improvements with educational focus
- Proper formatting and spacing normalization
- Context-aware processing with retry logic
"""
def __init__(self, model_name="qwen2.5:7b"):
    """Store the model name and set up patterns, counters and file names.

    Args:
        model_name: Name of the model to use; stored as ``self.model_name``.
    """
    self.model_name = model_name
    # Matches `@fig-<id>` cross-references; the id is captured as group 1.
    self.figure_pattern = re.compile(r'@fig-([a-zA-Z0-9_-]+)')

    # All integer counters start at zero; 'errors' is a list and is added
    # last so the key order stays the same as the counters listed here.
    counter_names = (
        'files_processed',
        'figures_found',
        'figures_improved',
        'tables_found',
        'tables_improved',
        'images_found',
        'images_missing',
        'json_success',
        'json_failed',
    )
    run_stats = dict.fromkeys(counter_names, 0)
    run_stats['errors'] = []
    self.stats = run_stats

    self.content_map_file = "content_map.json"
    self.quality_checker = CaptionQualityChecker()
    self.quarto_config_file = "_quarto.yml"
def find_qmd_files(self, directory: str) -> List[Path]:
    """Find all .qmd files in a directory recursively.

    Args:
        directory: Root directory to scan.

    Returns:
        The matching paths in sorted order. Directory traversal order depends
        on the filesystem, so the result is sorted to keep the processing
        order reproducible between runs and machines.
    """
    return sorted(Path(directory).rglob("*.qmd"))
def get_book_chapters_from_quarto(self) -> Dict[str, List[str]]:
    """Parse _quarto.yml and return active and commented chapter files.

    Returns:
        A dict with two lists of paths:
        ``active`` - .qmd entries under ``book.chapters`` in config order,
        including a part's own .qmd file and the chapters nested under it;
        ``commented`` - paths taken from lines of the form ``# - <path>.qmd``.
        Both lists are empty when the config file is missing or cannot be
        parsed (an error message is printed in either case).
    """
    if not os.path.exists(self.quarto_config_file):
        print(f"❌ Quarto config not found: {self.quarto_config_file}")
        return {'active': [], 'commented': []}
    try:
        # Read once; the same text feeds the YAML parse and the comment scan.
        with open(self.quarto_config_file, 'r', encoding='utf-8') as f:
            raw_content = f.read()

        # An empty file parses to None, and a key with no value is None too.
        config = yaml.safe_load(raw_content) or {}
        chapters = (config.get('book') or {}).get('chapters') or []

        active_chapters = []

        def collect(entries):
            """Append .qmd files from a chapter list, descending into parts."""
            if not isinstance(entries, list):
                return
            for entry in entries:
                if isinstance(entry, str):
                    # Simple string chapter (e.g., "index.qmd")
                    if entry.endswith('.qmd'):
                        active_chapters.append(entry)
                elif isinstance(entry, dict):
                    # A part may be a title or a .qmd file; only files count.
                    part = entry.get('part')
                    if isinstance(part, str) and part.endswith('.qmd'):
                        active_chapters.append(part)
                    # Chapters listed under the part belong to the book too.
                    collect(entry.get('chapters'))

        collect(chapters)

        # Commented-out chapters are invisible to the YAML parser, so they
        # are recovered from the raw text.
        commented_chapters = []
        for line in raw_content.split('\n'):
            line = line.strip()
            if line.startswith('# - ') and line.endswith('.qmd'):
                commented_chapters.append(line[4:].strip())

        print(f"📚 Found {len(active_chapters)} active chapters, {len(commented_chapters)} commented chapters")
        return {
            'active': active_chapters,
            'commented': commented_chapters
        }
    except Exception as e:
        # Best effort: callers fall back to a directory scan on empty results.
        print(f"❌ Error parsing {self.quarto_config_file}: {e}")
        return {'active': [], 'commented': []}
def find_qmd_files_in_order(self, directories: List[str]) -> List[Path]:
    """Return QMD files in the chapter order declared in _quarto.yml.

    Args:
        directories: Directories whose chapters should be included.

    Returns:
        Existing chapter paths that lie inside one of ``directories``, in
        book order. When no active chapters are found, every .qmd file found
        by scanning ``directories`` is returned instead.
    """
    ordered_chapters = self.get_book_chapters_from_quarto().get('active', [])

    if not ordered_chapters:
        print("⚠️ No book structure found, falling back to directory scan")
        scanned = []
        for directory in directories:
            scanned += self.find_qmd_files(directory)
        return scanned

    target_dirs = {os.path.normpath(d) for d in directories}
    selected = []
    for chapter in map(Path, ordered_chapters):
        for target in target_dirs:
            try:
                # relative_to raises ValueError when chapter is outside target.
                chapter.relative_to(target)
                if chapter.exists():
                    selected.append(chapter)
                # The chapter belongs to this directory; stop looking.
                break
            except ValueError:
                continue

    print(f"📖 Processing {len(selected)} chapters in book order")
    return selected
def check_commented_chapters_in_directories(self, directories: List[str]) -> Dict:
    """Report commented-out chapters that live inside the target directories.

    Args:
        directories: Directories that are about to be processed.

    Returns:
        A dict with ``commented_in_target_dirs`` (a list of
        ``{'chapter', 'directory'}`` dicts), ``total_issues`` (its length)
        and ``should_halt`` (True when at least one was found).
    """
    commented = self.get_book_chapters_from_quarto().get('commented', [])
    target_dirs = {os.path.normpath(d) for d in directories}

    hits = []
    for chapter in commented:
        chapter_path = Path(chapter)
        for target in target_dirs:
            try:
                # Raises ValueError when the chapter is outside this directory.
                chapter_path.relative_to(target)
            except ValueError:
                continue
            hits.append({'chapter': chapter, 'directory': target})
            # Record each chapter once, under the first directory that holds it.
            break

    return {
        'commented_in_target_dirs': hits,
        'total_issues': len(hits),
        'should_halt': len(hits) > 0,
    }
def print_commented_chapter_issues(self, issues: Dict):
    """Print a report about commented chapters and say whether to halt.

    Args:
        issues: A dict with ``total_issues`` and ``commented_in_target_dirs``
            (a list of dicts with ``chapter`` and ``directory`` keys).

    Returns:
        False, printing nothing, when there are no issues; True after
        printing the report otherwise.
    """
    total = issues['total_issues']
    if total == 0:
        return False

    print("\n🚨 CRITICAL ISSUE:")
    print(f"Found {total} commented chapters in target directories")
    print("Processing cannot continue as QMD files will be inconsistent.")

    print("\n📁 Commented chapters in target directories:")
    for entry in issues['commented_in_target_dirs']:
        print(f"{entry['chapter']} (in directory: {entry['directory']})")

    closing_lines = (
        "\n💡 To fix:",
        " 1. Uncomment these chapters in _quarto.yml, OR",
        " 2. Exclude these directories from processing, OR",
        " 3. Run 'quarto render --to=titlepage-pdf' after uncommenting",
        "\n❌ HALTING EXECUTION - Please resolve these issues first.",
    )
    for message in closing_lines:
        print(message)
    return True  # Caller should halt
def normalize_caption_punctuation(self, caption: str) -> str:
    """Trim the caption and make sure it ends with terminal punctuation.

    Args:
        caption: Caption text; empty or None is returned unchanged.

    Returns:
        The stripped caption, with a period appended unless it is empty or
        already ends with '.', '!' or '?'.
    """
    if not caption:
        return caption
    trimmed = caption.strip()
    if not trimmed or trimmed.endswith(('.', '!', '?')):
        return trimmed
    return trimmed + '.'
def normalize_caption_case(self, caption: str) -> str:
    """Apply sentence case to a caption, leaving a leading bold label alone.

    Args:
        caption: Caption text; empty or None is returned unchanged.

    Returns:
        For a ``**Bold**: explanation`` caption, the bold label (with its
        colon and following whitespace) unchanged, followed by the
        explanation in sentence case. Otherwise the whole caption in
        sentence case.
    """
    if not caption:
        return caption

    bold_match = re.match(r'^(\*\*[^*]+\*\*:\s*)(.+)$', caption.strip())
    if bold_match is None:
        # No bold label: the caption is one plain sentence.
        return self.apply_sentence_case(caption)

    label, explanation = bold_match.groups()
    return label + self.apply_sentence_case(explanation)
def apply_sentence_case(self, text: str) -> str:
    """
    Apply sentence case while keeping technical terms in canonical form.

    Rules:
    - The first word is capitalized; later words are lowercased.
    - Terms in the preserve list are restored to their canonical spelling,
      matched case-insensitively. Entries that contain punctuation (C++, C#,
      Node.js, CI/CD, Scikit-learn) are tokenized as one unit so that they
      can actually match.
    - Preserve-list terms that are also ordinary English words (REST, Go,
      mAP, ...) are kept only when already written in canonical case, so
      "feature map" is not rewritten to "feature mAP".
    - All-caps words longer than one character are treated as acronyms and
      kept as-is, including in first position.
    - Whitespace and punctuation pass through unchanged.

    Args:
        text: Text to convert; empty or None is returned unchanged.

    Returns:
        The text in sentence case.
    """
    if not text:
        return text

    # Comprehensive list of terms to preserve (canonical spelling)
    preserve_exact = {
        # Technical acronyms
        'AI', 'ML', 'IoT', 'GPU', 'CPU', 'API', 'UI', 'UX', 'PDF', 'HTML', 'JSON', 'XML',
        'HTTP', 'HTTPS', 'SQL', 'NoSQL', 'REST', 'SOAP', 'TCP', 'UDP', 'IP', 'DNS',
        'TinyML', 'MLOps', 'DevOps', 'CI/CD', 'SDK', 'IDE', 'CLI', 'GUI',
        # Companies and products
        'AlexNet', 'FarmBeats', 'TikZ', 'LaTeX', 'GitHub', 'YouTube', 'Microsoft',
        'Google', 'Amazon', 'Facebook', 'Netflix', 'Tesla', 'OpenAI', 'NVIDIA',
        'PyTorch', 'TensorFlow', 'Keras', 'Scikit-learn',
        # Research terms
        'CNN', 'RNN', 'LSTM', 'GRU', 'BERT', 'GPT', 'ResNet', 'VGG', 'YOLO',
        'IoU', 'mAP', 'BLEU', 'ROUGE', 'F1', 'ROC', 'AUC', 'MSE', 'RMSE',
        'SGD', 'Adam', 'AdaGrad', 'RMSprop',
        # File formats and standards
        'PNG', 'JPEG', 'SVG', 'CSV', 'YAML', 'TOML', 'HDF5', 'ONNX',
        'Docker', 'Kubernetes', 'AWS', 'GCP', 'Azure', 'S3',
        # Programming languages and tools
        'Python', 'JavaScript', 'TypeScript', 'Java', 'C++', 'C#', 'Go', 'Rust',
        'React', 'Vue', 'Angular', 'Node.js', 'MongoDB', 'PostgreSQL', 'Redis'
    }

    # Preserved terms that double as everyday words. Matching these
    # case-insensitively would corrupt ordinary prose ("the rest" ->
    # "the REST"), so they are honoured only in their exact spelling.
    ambiguous_terms = {'REST', 'SOAP', 'Go', 'Rust', 'React', 'Angular', 'mAP'}

    # Upper-cased lookup built once instead of once per token.
    canonical_by_upper = {
        term.upper(): term for term in preserve_exact if term not in ambiguous_terms
    }

    # Terms with non-word characters must be matched before the generic word
    # pattern, which would otherwise split them ("C", "+", "+").
    punctuated_terms = sorted(
        (term for term in preserve_exact if re.search(r'\W', term)),
        key=len,
        reverse=True,
    )
    special = '|'.join(re.escape(term) for term in punctuated_terms)
    token_pattern = re.compile(
        rf"(?<!\w)(?:{special})(?!\w)|\b\w+(?:'\w+)?\b|\s+|[^\w\s]",
        re.IGNORECASE,
    )

    result_tokens = []
    seen_word = False  # Becomes True once the first real word is emitted
    for token in token_pattern.findall(text):
        if not re.match(r'\w', token):
            # Whitespace and punctuation are copied through untouched.
            result_tokens.append(token)
            continue

        canonical = canonical_by_upper.get(token.upper())
        if canonical is not None:
            result_tokens.append(canonical)
        elif token in ambiguous_terms or (token.isupper() and len(token) > 1):
            # Exact ambiguous term, or an all-caps word (likely an acronym).
            result_tokens.append(token)
        elif not seen_word:
            result_tokens.append(token.capitalize())
        else:
            result_tokens.append(token.lower())
        seen_word = True

    return ''.join(result_tokens)
def format_bold_explanation_caption(self, caption: str) -> str:
    """
    Normalize capitalization of a ``**bold**: explanation`` caption.

    The bold part is put in title case and the explanation in sentence case.

    Args:
        caption: Caption text.

    Returns:
        ``**Bold Part**: Explanation part`` with exactly one space after the
        colon, or the caption unchanged when it is empty or does not match
        the ``**bold**: explanation`` shape.
    """
    # Cheap pre-check before running the regex.
    if not caption or '**' not in caption or ':' not in caption:
        return caption

    parsed = re.match(r'^\*\*([^*]+)\*\*:\s*(.+)$', caption.strip())
    if parsed is None:
        return caption

    heading = titlecase(parsed.group(1).strip())
    body = self.apply_sentence_case(parsed.group(2).strip())
    return f"**{heading}**: {body}"
def fix_capitalization_after_periods(self, text: str) -> str:
    """
    Capitalize the first letter of the text and of each sentence after ". ".

    A sentence is not capitalized when the word before the period is a known
    abbreviation (e.g., i.e., vs., fig., ...), because the period then does
    not end a sentence.

    Args:
        text: Text to fix; empty or None is returned unchanged.

    Returns:
        The text with sentence starts capitalized.
    """
    if not text:
        return text

    # Abbreviations whose period does not end a sentence. Acronyms such as
    # "ML." or "GPU." are deliberately not listed: an acronym followed by
    # ". " is a sentence ending, and the next sentence must be capitalized.
    abbreviations = {
        'dr.', 'prof.', 'mr.', 'mrs.', 'ms.', 'vs.', 'etc.', 'i.e.', 'e.g.',
        'fig.', 'tbl.', 'eq.', 'sec.', 'ch.', 'vol.', 'no.', 'p.', 'pp.',
    }

    # With a capturing group the result alternates text, '. ', text, ...
    # so separators sit at the odd indices and are always followed by text.
    parts = re.split(r'(\. )', text)

    if parts[0] and parts[0][0].islower():
        parts[0] = parts[0][0].upper() + parts[0][1:]

    for sep_index in range(1, len(parts) - 1, 2):
        following = parts[sep_index + 1]
        if not following or not following[0].islower():
            continue

        words = ''.join(parts[:sep_index]).split()
        last_word = words[-1].lower() if words else ""
        # The split consumed the word's trailing period, so put it back
        # before the lookup; also drop leading punctuation such as "(".
        candidate = re.sub(r'^\W+', '', last_word).rstrip('.') + '.'
        if candidate not in abbreviations:
            parts[sep_index + 1] = following[0].upper() + following[1:]

    return ''.join(parts)
def improve_sentence_starters(self, text: str) -> str:
    """
    Replace weak descriptive verbs with stronger, more direct language.

    The text is split on ". " and each sentence is rewritten in two steps:
    1. A leading weak verb is dropped ("Shows how X" -> "X",
       "Illustrates the X" -> "The X", "Depicts X" -> "X").
    2. The first matching mid-sentence construction is rewritten
       ("X illustrates how Y" -> "Y through X", "X reveals that Y" ->
       "X establishes that Y", "X depicts Y" -> "X presents Y").
    Terminal punctuation is set aside during rewriting and re-attached at the
    end, so a reordered sentence still ends with its period. The first letter
    of every sentence is capitalized.

    Args:
        text: Text to improve; empty or None is returned unchanged.

    Returns:
        The rewritten text. Sentences are stripped; spacing is expected to
        be normalized by the caller.
    """
    if not text:
        return text

    starter_verbs = (
        'illustrates', 'shows', 'demonstrates', 'depicts', 'reveals', 'highlights',
        'visualizes', 'exemplifies', 'traces', 'explains', 'displays', 'presents',
    )
    verb_alt = '|'.join(starter_verbs)
    # Tried in order; the first rule that matches wins.
    beginning_rules = (
        # "Illustrates how X" -> "X" (direct approach)
        (re.compile(rf'^(?:{verb_alt}) how (.+)$', re.IGNORECASE), r'\1'),
        # "Illustrates the X" -> "The X" (remove weak verb)
        (re.compile(rf'^(?:{verb_alt}) the (.+)$', re.IGNORECASE), r'The \1'),
        # Generic weak starter - remove entirely
        (re.compile(rf'^(?:{verb_alt}) (.+)$', re.IGNORECASE), r'\1'),
    )

    # verb, connector for "X <verb> how Y", verb for "X <verb> that Y",
    # verb for plain "X <verb> Y".
    mid_rewrites = (
        ('illustrates', 'through', 'confirms', 'enables'),
        ('demonstrates', 'via', 'establishes', 'provides'),
        ('depicts', 'using', 'confirms', 'presents'),
        ('reveals', 'through', 'establishes', 'exposes'),
        ('highlights', 'via', 'emphasizes', 'emphasizes'),
        ('visualizes', 'through', 'confirms', 'presents'),
        ('exemplifies', 'via', 'establishes', 'provides'),
        ('traces', 'through', 'confirms', 'reveals'),
        ('explains', 'via', 'establishes', 'clarifies'),
        ('displays', 'using', 'confirms', 'presents'),
        ('presents', 'through', 'establishes', 'provides'),
    )
    determiners = ('The ', 'A ', 'An ', 'This ', 'That ', 'These ', 'Those ')

    def swap_around(connector):
        """Build a replacement turning "X <verb> how Y" into "Y <connector> X"."""
        def build(match):
            subject, claim = match.group(1), match.group(2)
            # The subject moves to the end, so a leading determiner no
            # longer starts the sentence and must not stay capitalized.
            if subject.startswith(determiners):
                subject = subject[0].lower() + subject[1:]
            return f'{claim} {connector} {subject}'
        return build

    # Order matters: all "how" rules, then all "that" rules, then plain ones.
    mid_rules = []
    for verb, connector, _, _ in mid_rewrites:
        mid_rules.append(
            (re.compile(rf'(.+?)\s+{verb} how (.+)', re.IGNORECASE), swap_around(connector))
        )
    for verb, _, that_verb, _ in mid_rewrites:
        mid_rules.append(
            (re.compile(rf'(.+?)\s+{verb} that (.+)', re.IGNORECASE), rf'\1 {that_verb} that \2')
        )
    for verb, _, _, plain_verb in mid_rewrites:
        mid_rules.append(
            (re.compile(rf'(.+?)\s+{verb} (.+)', re.IGNORECASE), rf'\1 {plain_verb} \2')
        )

    improved_sentences = []
    for chunk in re.split(r'(\. )', text):
        sentence = chunk.strip()
        if chunk == '. ' or not sentence:
            # Separators and empty pieces are kept verbatim.
            improved_sentences.append(chunk)
            continue

        # Set terminal punctuation aside so reordering cannot strand it
        # in the middle of the sentence.
        core = sentence.rstrip('.!?')
        tail = sentence[len(core):]

        for pattern, replacement in beginning_rules:
            if pattern.search(core):
                core = pattern.sub(replacement, core)
                break

        for pattern, replacement in mid_rules:
            if pattern.search(core):
                core = pattern.sub(replacement, core)
                break

        # Ensure first letter is capitalized
        if core and core[0].islower():
            core = core[0].upper() + core[1:]

        improved_sentences.append(core + tail)

    return ''.join(improved_sentences)
def normalize_spacing(self, text: str) -> str:
    """
    Collapse every run of whitespace to one space and trim both ends.

    Newlines and tabs count as whitespace, so multi-line text becomes a
    single line. Empty or None input is returned unchanged.
    """
    if not text:
        return text
    return re.sub(r'\s+', ' ', text).strip()
def escape_caption_for_regex(self, caption: str) -> str:
    """
    Backslash-escape a fixed set of regex metacharacters in a caption.

    The escaped characters are: . ^ $ * + ? { } [ ] \\ |
    Every other character, including parentheses, is copied through as-is.

    NOTE(review): parentheses are left unescaped on purpose here, but a regex
    engine still treats them as group delimiters, so a pattern built from the
    result will not match literal "(" / ")" and an unbalanced parenthesis
    makes the pattern invalid - confirm that callers account for this.

    Args:
        caption: Caption text that may contain special characters

    Returns:
        The caption with the characters listed above escaped; empty or None
        input is returned unchanged.
    """
    if not caption:
        return caption

    to_escape = frozenset(r'\.^$*+?{}[]|')
    return ''.join('\\' + ch if ch in to_escape else ch for ch in caption)
def ensure_yaml_safe_caption(self, caption: str) -> str:
    """
    Ensure caption is safe for YAML parsing by adding quotes when needed.

    A plain YAML scalar may not start with ``*`` (the alias indicator), so
    captions that start with ``**`` are quoted. Captions that do not start
    with ``**`` - including ones that are already wrapped in quotes - are
    returned stripped but otherwise unchanged.

    Quoting style:
    - Double quotes by default, with embedded double quotes escaped.
    - Single quotes when the caption contains a backslash (e.g. LaTeX such
      as ``$\\alpha$``). Inside double quotes YAML treats a backslash as the
      start of an escape sequence, which would corrupt or reject the text;
      single-quoted scalars take backslashes literally and only need
      embedded single quotes doubled.

    Args:
        caption: The caption text to make YAML-safe

    Returns:
        Caption with proper YAML quoting if needed
    """
    if not caption:
        return caption

    caption = caption.strip()
    if not caption.startswith('**'):
        return caption

    if '\\' in caption:
        return "'" + caption.replace("'", "''") + "'"
    return '"' + caption.replace('"', '\\"') + '"'
def extract_caption_from_yaml_value(self, yaml_value: str) -> str:
    """
    Extract the caption text from a YAML value, quoted or unquoted.

    Args:
        yaml_value: The YAML value which might be quoted or unquoted

    Returns:
        The stripped caption without surrounding quotes. For double-quoted
        values ``\\"`` is unescaped to ``"``; for single-quoted values ``''``
        is unescaped to ``'``. Unquoted values, and a value consisting of a
        single quote character, are returned stripped but otherwise as-is.
    """
    if not yaml_value:
        return yaml_value

    value = yaml_value.strip()

    # A quoted scalar needs an opening and a closing quote; a lone quote
    # character is plain text, not an empty quoted string.
    if len(value) >= 2:
        if value[0] == value[-1] == '"':
            return value[1:-1].replace('\\"', '"')
        if value[0] == value[-1] == "'":
            return value[1:-1].replace("''", "'")

    # Not quoted, return as-is
    return value
def validate_and_improve_caption(self, caption: str, is_table: bool = False) -> str:
    """
    Clean up a caption and strengthen its explanation text.

    Steps: normalize spacing, drop a leading table prefix (": ") and any
    other leading colons, then - for captions in ``**bold**: explanation``
    form - improve sentence starters, fix capitalization after periods and
    normalize spacing in the explanation.

    Args:
        caption: The caption to improve
        is_table: Accepted for callers; this method does not read it

    Returns:
        ``**bold**: explanation`` with a single space after the colon, or
        the cleaned caption when it is not in that form. A stripped table
        prefix is not added back. Empty or None input is returned unchanged.
    """
    if not caption:
        return caption

    caption = self.normalize_spacing(caption)

    # Drop the ': ' table prefix, then any leftover leading colons
    # (handles edge cases like ::, : :, etc.).
    if caption.startswith(': '):
        caption = caption[2:]
    caption = re.sub(r'^:+\s*', '', caption)

    # Allow spaces on either side of the colon after the bold part.
    parsed = re.match(r'^(\*\*[^*]+\*\*)\s*:\s*(.+)$', caption)
    if parsed is None:
        return caption

    bold_part, explanation = parsed.groups()
    cleanup_steps = (
        self.improve_sentence_starters,
        self.fix_capitalization_after_periods,
        self.normalize_spacing,
    )
    for step in cleanup_steps:
        explanation = step(explanation)

    return f"{bold_part}: {explanation}"
def encode_image(self, image_path: str) -> Optional[str]:
    """Read an image file and return its contents as base64 text.

    Args:
        image_path: Path of the file to read.

    Returns:
        The base64 encoding of the file's bytes as a str, or None when
        reading or encoding fails (the error is printed).
    """
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        return base64.b64encode(raw_bytes).decode('utf-8')
    except Exception as e:
        print(f"❌ Error encoding image {image_path}: {e}")
        return None
def extract_paragraph_context(self, content: str, figure_or_table_id: str) -> Dict[str, str]:
    """Extract focused context: the paragraph with the ID plus its neighbours.

    Args:
        content: Full document text.
        figure_or_table_id: ID to look for (plain substring match).

    Returns:
        A dict with ``title`` (the nearest preceding ``## `` heading, or
        "Unknown Section") and ``content`` (the previous, matching and next
        paragraphs joined by blank lines; neighbours that are ``## ``
        headings are left out). When no paragraph contains the ID the result
        of ``_extract_fallback_context`` is returned instead.
    """
    # Paragraphs are separated by blank lines; empty pieces are dropped.
    paragraphs = [
        chunk.strip()
        for chunk in re.split(r'\n\s*\n', content)
        if chunk.strip()
    ]

    reference = f"@{figure_or_table_id}"
    target = next(
        (
            idx
            for idx, para in enumerate(paragraphs)
            if figure_or_table_id in para or reference in para
        ),
        None,
    )
    if target is None:
        return self._extract_fallback_context(content, figure_or_table_id)

    # The most recent level-2 heading at or before the target paragraph
    # names the section; a trailing {#id ...} attribute is not captured.
    preceding_text = '\n\n'.join(paragraphs[:target + 1])
    headers = re.findall(
        r'^##\s+([^#\n]+?)(?:\s*\{#[^}]+\}.*)?$',
        preceding_text,
        re.MULTILINE,
    )
    section_title = headers[-1].strip() if headers else "Unknown Section"

    heading_start = r'^##\s+'
    selected = []
    has_previous = target > 0
    if has_previous and not re.match(heading_start, paragraphs[target - 1]):
        selected.append(paragraphs[target - 1])
    selected.append(paragraphs[target])
    has_next = target + 1 < len(paragraphs)
    if has_next and not re.match(heading_start, paragraphs[target + 1]):
        selected.append(paragraphs[target + 1])

    return {
        'title': section_title,
        'content': '\n\n'.join(selected)
    }
def _extract_fallback_context(self, content: str, figure_or_table_id: str) -> Dict[str, str]:
    """Fallback: extract context around the first line that contains the ID.

    Args:
        content: Full document text.
        figure_or_table_id: ID to look for (plain substring match).

    Returns:
        A dict with ``title`` and ``content``. The title is the nearest
        level-2 heading at or above the matching line (or "Unknown
        Section"). The content is the window of lines from 10 before the
        match up to 9 after it; when that window exceeds 300 words it is cut
        to the 150 words on either side of the word containing the ID. When
        no line contains the ID, the first 1000 characters of ``content``
        are returned under "Unknown Section".
    """
    lines = content.split('\n')

    for i, line in enumerate(lines):
        if figure_or_table_id not in line:
            continue

        # Walk upwards to the closest "## " heading (not "###" or deeper).
        section_title = "Unknown Section"
        for j in range(i, -1, -1):
            check_line = lines[j].strip()
            if check_line.startswith('##') and not check_line.startswith('###'):
                section_title = re.sub(r'^#+\s*', '', check_line)
                section_title = re.sub(r'\s*\{#[^}]+\}.*$', '', section_title)
                break

        start_idx = max(0, i - 10)
        end_idx = min(len(lines), i + 10)
        context_text = '\n'.join(lines[start_idx:end_idx]).strip()

        # Limit to ~300 words around the figure
        words = context_text.split()
        if len(words) > 300:
            fig_word_pos = next(
                (idx for idx, word in enumerate(words) if figure_or_table_id in word),
                None,
            )
            # Position 0 is a valid hit, so test against None explicitly.
            if fig_word_pos is not None:
                start_word = max(0, fig_word_pos - 150)
                end_word = min(len(words), fig_word_pos + 150)
                context_text = ' '.join(words[start_word:end_word])

        return {
            'title': section_title,
            'content': context_text
        }

    # Ultimate fallback
    return {
        'title': "Unknown Section",
        'content': content[:1000]
    }
def generate_caption_with_ollama(self, section_title: str, section_text: str,
                                 figure_id: str, current_caption: str,
                                 image_path: Optional[str] = None, is_table: bool = False,
                                 is_listing: bool = False, code_content: Optional[str] = None) -> Optional[str]:
    """Generate an improved caption by POSTing a prompt to a local Ollama server.

    Args:
        section_title: Section heading inserted into the prompt.
        section_text: Surrounding textbook text; only the first 1500
            characters are placed in the prompt.
        figure_id: Accepted for interface symmetry; not read by this method.
        current_caption: Existing caption, shown to the model as the original.
        image_path: Optional image file; when it exists on disk and
            ``self.encode_image`` returns a value, that value is sent in the
            payload's ``images`` list.
        is_table: Forwarded to ``self.validate_and_improve_caption``.
        is_listing: Switches the prompt wording to "code listing" and adds a
            CODE CONTENT section.
        code_content: Listing source; only the first 1500 characters are used,
            and only when ``is_listing`` is true.

    Returns:
        The caption after ``format_bold_explanation_caption`` and
        ``validate_and_improve_caption``, or None when the response lacks
        both ``**`` and ``:``, or when every attempt ends in a non-200
        status or an exception.
    """
    import time
    # Construct a focused, context-aware prompt
    content_type = "code listing" if is_listing else "visual (figure or table)"
    prompt = f"""You are an expert at editing a caption for a {content_type} in a technical AI/ML systems textbook.
Your task is to improve the caption so that it *teaches*. The goal is to help students understand what the {"code demonstrates or implements" if is_listing else "visual conveys"} in the context of machine learning systems.
SECTION: {section_title}
ORIGINAL CAPTION: {current_caption}
TEXTBOOK CONTEXT (for reference):
{section_text[:1500]}
{f'''CODE CONTENT:
{code_content[:1500] if code_content else "No code content provided"}
''' if is_listing else ""}
🧠 TASK: Rewrite the caption to make it educational, precise, and aligned with the visual's teaching purpose.
✍️ FORMAT:
**<Key Phrase>**: Explanation sentence(s)
🚫 CRITICAL RULE - NEVER START WITH WEAK DESCRIPTIVE VERBS:
You must NEVER begin explanation sentences with these weak words:
"Shows", "Demonstrates", "Illustrates", "Depicts", "Reveals", "Highlights", "Displays", "Presents", "Exhibits", "Portrays", "Visualizes", "Exemplifies", "Traces", "Explains"
Instead, write DIRECT, ACTIVE statements:
❌ BAD: "Shows how neural networks process data"
✅ GOOD: "Neural networks process data through layered transformations"
❌ BAD: "Demonstrates the relationship between accuracy and efficiency"
✅ GOOD: "Higher model accuracy requires more computational resources"
✅ REQUIREMENTS:
1. **Key Phrase**: A single bolded noun phrase (15 words) that captures the main idea. Avoid full sentences or multiple bolded phrases. If similar figures exist in this section, choose a unique but relevant phrase.
2. **Explanation**: 12 concise, natural sentences that express what the student *learns* from the figure or table. Use active voice. Avoid simply describing what the figure "shows"—explain what *insight* it provides or how it advances understanding.
3. **Terminology**: Use domain-specific language from the original caption if helpful, but rephrase it to clarify meaning for students.
4. **No Weak Openers**: Do not begin with "This figure...", "This table...", or "This diagram...". Begin with the concept or the takeaway.
5. **Clarity & Precision**: Be specific, pedagogical, and concrete. Emphasize learning outcomes over general description.
6. **Tone**: Use a textbook tone. Use technical but student-friendly language appropriate for upper-level undergraduates or early graduate students. Avoid jargon unless it is defined or central to the concept.
7. **Sources**: If the original caption includes a source (e.g., "Source: IEEE Spectrum"), retain it at the end of the caption in italics. Append it after a period.
📌 EXCELLENT TEXTBOOK EXAMPLES:
**Attention Mechanism**: Transformer models compute attention through query-key-value interactions, enabling dynamic focus across input sequences for improved language understanding.
**Farm Edge Integration**: Modern agricultural systems deploy AI directly on IoT devices to process sensor data locally, reducing latency and improving real-time decision making.
**Training Pipeline**: Machine learning workflows partition datasets into training, validation, and test sets to ensure robust model development and unbiased evaluation.
🚫 AVOID:
- Starting with "This figure shows..." or "This table illustrates..."
- Using a full sentence or list as the bold key phrase
- Repeating the section title or being too vague (e.g., **AI System**)
🖊️ OUTPUT: Write only the improved caption below:
🚫 CRITICAL: NEVER START WITH WEAK VERBS:
- BANNED WORDS: "Shows", "Demonstrates", "Illustrates", "Depicts", "Reveals", "Highlights", "Displays", "Presents", "Exhibits", "Portrays", "Visualizes", "Exemplifies", "Traces", "Explains"
- BANNED PHRASES: "This figure/table/diagram...", "As shown in...", "The illustration demonstrates...", "The visual depicts..."
- These make captions sound like descriptions, not teaching tools
✅ STRONG SENTENCE PATTERNS TO USE:
- Direct statements: "Neural networks process data through..."
- System descriptions: "The architecture combines..."
- Process explanations: "Training requires..."
- Comparative insights: "Edge computing reduces latency while..."
- Technical definitions: "Convolutional layers extract..."
- Causal relationships: "Larger models achieve higher accuracy but..."
💡 BEFORE vs AFTER EXAMPLES:
❌ WEAK: "Illustrates how neural networks process data"
✅ STRONG: "Neural networks process input data through hierarchical feature extraction"
❌ WEAK: "Shows the relationship between accuracy and efficiency"
✅ STRONG: "Model accuracy increases with computational complexity, creating efficiency trade-offs"
❌ WEAK: "Demonstrates edge computing benefits"
✅ STRONG: "Edge computing reduces latency by processing data locally rather than in the cloud"
❌ WEAK: "Visualizes the ML pipeline stages"
✅ STRONG: "Machine learning pipelines consist of data preprocessing, training, and deployment phases"
🎯 FINAL REMINDER: Write the explanation sentence(s) with DIRECT, ACTIVE language. NO weak descriptive verbs!
"""
    # Retry logic: at most max_retries attempts in total (2 as configured),
    # sleeping base_delay * 2**attempt seconds between attempts.
    max_retries = 2
    base_delay = 1  # seconds
    for attempt in range(max_retries):
        try:
            # Prepare the request payload
            payload = {
                "model": self.model_name,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.7,  # Higher temperature for more diverse, creative captions
                    "num_predict": -1,  # No token limit - generate complete responses
                    "top_p": 0.9  # Add nucleus sampling for better variety
                }
            }
            # Add image if provided (for multimodal models)
            if image_path and os.path.exists(image_path):
                encoded_image = self.encode_image(image_path)
                if encoded_image:
                    payload["images"] = [encoded_image]
            # Make request to Ollama
            response = requests.post(
                "http://localhost:11434/api/generate",
                json=payload,
                timeout=60
            )
            if response.status_code == 200:
                result = response.json()
                new_caption = result.get('response', '').strip()
                # Clean up any markdown code blocks: drop the surrounding
                # backticks, then a leading "json" language tag if present.
                if new_caption.startswith('```') and new_caption.endswith('```'):
                    new_caption = new_caption.strip('`').strip()
                    if new_caption.startswith('json\n'):
                        new_caption = new_caption[5:].strip()
                # Log word count but never reject based on length
                word_count = len(new_caption.split())
                if word_count > 150:
                    print(f" Long caption generated ({word_count} words): {new_caption[:100]}...")
                    # Continue processing - user wants NO truncation limits
                # Loose format check: the reply must contain "**" and ":"
                # somewhere; it does not verify the exact "**Key**: text" shape.
                if '**' in new_caption and ':' in new_caption:
                    # Apply comprehensive quality improvements
                    formatted_caption = self.format_bold_explanation_caption(new_caption)
                    improved_caption = self.validate_and_improve_caption(formatted_caption, is_table)
                    # Log final word count but never reject
                    final_word_count = len(improved_caption.split())
                    if final_word_count > 150:
                        print(f" Large improved caption ({final_word_count} words): continuing anyway...")
                    return improved_caption
                else:
                    print(f" ⚠️ Generated caption doesn't follow **bold**: format: {new_caption[:100]}")
                    # Don't retry for format issues - this is a generation problem, not API error
                    return None
            else:
                # API error - this is worth retrying
                if attempt < max_retries - 1:
                    delay = base_delay * (2 ** attempt)
                    print(f" ⚠️ Ollama API error {response.status_code}, retrying in {delay}s (attempt {attempt + 1}/{max_retries})")
                    time.sleep(delay)
                    continue
                else:
                    print(f" ❌ Ollama API error: {response.status_code} (all {max_retries} attempts failed)")
                    return None
        except requests.exceptions.RequestException as e:
            # Network/connection error - worth retrying
            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)
                print(f" ⚠️ Request error: {e}, retrying in {delay}s (attempt {attempt + 1}/{max_retries})")
                time.sleep(delay)
                continue
            else:
                print(f" ❌ Request error: {e} (all {max_retries} attempts failed)")
                return None
        except Exception as e:
            # Unexpected error (this also covers a response body that is not
            # valid JSON) - retried like the others, but likely a code issue
            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)
                print(f" ⚠️ Unexpected error: {e}, retrying in {delay}s (attempt {attempt + 1}/{max_retries})")
                time.sleep(delay)
                continue
            else:
                print(f" ❌ Unexpected error: {e} (all {max_retries} attempts failed)")
                return None
    # Every loop path above returns or continues; this is a defensive default.
    return None
def compile_tikz_to_image(self, tikz_code: str, figure_id: str) -> Optional[str]:
    """Compile TikZ code to a PNG image for multimodal processing.

    Writes a standalone LaTeX document into ``temp_tikz/``, runs
    ``pdflatex`` on it and converts the resulting PDF to PNG with
    ImageMagick. Intermediate ``.tex``/``.pdf``/``.aux``/``.log`` files are
    removed afterwards; the PNG is kept for the caller.

    Args:
        tikz_code: TikZ source placed verbatim inside the document body.
        figure_id: Used as the base name of every generated file.

    Returns:
        Path of the generated PNG as a string, or None when either tool
        fails, times out, or cannot be started.
    """
    temp_dir = Path("temp_tikz")
    temp_dir.mkdir(exist_ok=True)
    tex_file = temp_dir / f"{figure_id}.tex"
    pdf_file = temp_dir / f"{figure_id}.pdf"
    png_file = temp_dir / f"{figure_id}.png"
    # Create minimal LaTeX document with TikZ
    latex_content = f"""\\documentclass{{standalone}}
\\usepackage{{tikz}}
\\usepackage{{pgfplots}}
\\usetikzlibrary{{positioning,arrows,shapes,calc}}
\\begin{{document}}
{tikz_code}
\\end{{document}}"""
    try:
        # Write LaTeX file
        with open(tex_file, 'w', encoding='utf-8') as f:
            f.write(latex_content)
        # Compile to PDF. Non-interactive flags make a TeX error fail
        # immediately instead of waiting on a prompt until the timeout.
        result = subprocess.run(
            ["pdflatex", "-interaction=nonstopmode", "-halt-on-error",
             "-output-directory", str(temp_dir), str(tex_file)],
            capture_output=True,
            text=True,
            timeout=30
        )
        if result.returncode != 0 or not pdf_file.exists():
            print(f" ❌ LaTeX compilation failed for {figure_id}")
            return None
        # Convert PDF to PNG using ImageMagick
        result = subprocess.run(
            ["magick", "convert", "-density", "150", str(pdf_file), str(png_file)],
            capture_output=True,
            text=True,
            timeout=15
        )
        if result.returncode != 0 or not png_file.exists():
            print(f" ❌ PDF to PNG conversion failed for {figure_id}")
            return None
        return str(png_file)
    except subprocess.TimeoutExpired:
        print(f" ❌ Compilation timeout for {figure_id}")
        return None
    except Exception as e:
        # Includes a missing pdflatex/magick executable (FileNotFoundError).
        print(f" ❌ Compilation error for {figure_id}: {e}")
        return None
    finally:
        # Clean up intermediate files. Best effort: only filesystem errors
        # are ignored, so KeyboardInterrupt/SystemExit still propagate.
        for temp_file in (tex_file, pdf_file, tex_file.with_suffix('.aux'),
                          tex_file.with_suffix('.log')):
            try:
                if temp_file.exists():
                    temp_file.unlink()
            except OSError:
                pass
def parse_sections(self, content: str) -> List[Dict[str, any]]:
    """Split QMD content into level-2 sections without tracking line numbers.

    Every ``## Title`` heading (optionally followed by a ``{#id}`` attribute)
    starts a section. Non-blank text before the first heading is reported
    under the title 'Introduction'.

    Args:
        content: QMD file content.

    Returns:
        List of dicts with 'title' and 'content' (both stripped), in document
        order. Empty when the content has no level-2 heading.
    """
    heading_re = r'^##\s+([^#\n]+?)(?:\s*\{#[^}]+\}.*)?$'
    # With one capturing group, re.split yields
    # [preamble, title1, body1, title2, body2, ...].
    pieces = re.split(heading_re, content, flags=re.MULTILINE)
    if len(pieces) <= 1:
        return []

    preamble = pieces[0].strip()
    parsed = [{'title': 'Introduction', 'content': preamble}] if preamble else []
    parsed.extend(
        {'title': heading.strip(), 'content': body.strip()}
        for heading, body in zip(pieces[1::2], pieces[2::2])
    )
    return parsed
def load_content_map(self) -> Dict:
    """Read the content map JSON stored at ``self.content_map_file``.

    Returns:
        The parsed JSON object, or an empty dict when the file is missing or
        cannot be loaded. Load failures are also appended to
        ``self.stats['errors']``.
    """
    map_path = self.content_map_file
    if os.path.exists(map_path):
        try:
            with open(map_path, 'r', encoding='utf-8') as handle:
                loaded = json.load(handle)
            figure_total = len(loaded.get('figures', {}))
            table_total = len(loaded.get('tables', {}))
            print(f"📋 Loaded content map: {figure_total} figures, {table_total} tables")
            return loaded
        except Exception as e:
            error_msg = f"Error loading content map: {e}"
            print(f"{error_msg}")
            self.stats['errors'].append(error_msg)
            return {}

    # No map on disk yet: tell the user how to create one.
    print(f"❌ Content map not found: {self.content_map_file}")
    print(f"💡 Run: python {os.path.basename(__file__)} --build-map")
    return {}
def save_content_map(self, content_map: Dict):
    """Write the content map to ``self.content_map_file`` as indented JSON.

    ``Path`` values nested inside dicts and lists are converted to strings
    first, since ``json`` cannot serialize them.

    Args:
        content_map: Mapping to persist.

    Raises:
        Exception: Any error raised while converting or writing is printed
            and then re-raised unchanged.
    """
    def _stringify_paths(node):
        """Return ``node`` with every nested Path value replaced by its string form."""
        if isinstance(node, Path):
            return str(node)
        if isinstance(node, dict):
            return {name: _stringify_paths(child) for name, child in node.items()}
        if isinstance(node, list):
            return [_stringify_paths(child) for child in node]
        return node

    try:
        payload = _stringify_paths(content_map)
        with open(self.content_map_file, 'w', encoding='utf-8') as out:
            json.dump(payload, out, indent=2, ensure_ascii=False)
        print(f"💾 Content map saved to: {self.content_map_file}")
    except Exception as e:
        print(f"❌ Error saving content map: {e}")
        raise
def find_figure_definition_in_qmd(self, content: str, fig_id: str) -> Optional[Dict[str, str]]:
    """
    Locate a figure definition, whichever supported syntax it uses.

    Three detectors are tried in this order and the first truthy result wins:
    1. Code-generated figures: ```{r|python} #| label: fig-id #| fig-cap: "caption" ...
    2. TikZ/Div figures: ::: {#fig-id} ```{.tikz} ... ``` caption :::
    3. Markdown figures: ![caption](path){#fig-id}

    Note: R figures are reported with type 'code' and language 'r'.

    Args:
        content: QMD file content
        fig_id: Full figure ID (e.g., "fig-example")

    Returns:
        The detector's result dict, or None if no detector finds the figure
    """
    detector_chain = (
        self.detect_code_figure,
        self.detect_tikz_figure,
        self.detect_markdown_figure,
    )
    # Generator keeps evaluation lazy: later detectors run only if needed.
    hits = (detect(content, fig_id) for detect in detector_chain)
    return next((hit for hit in hits if hit), None)
def detect_markdown_figure(self, content: str, fig_id: str) -> Optional[Dict[str, str]]:
    """
    Find a standard markdown image that carries the given figure ID.

    Expected shape: ![caption](image_path){#fig-id optional-attributes}
    The ID must be the first attribute inside the braces and must match
    exactly (it has to be followed by whitespace or the closing brace).
    Captions may contain one level of nested brackets, e.g. a citation.

    Examples:
        ![Simple caption](images/example.png){#fig-example}
        ![**Bold Caption**: Description](./images/png/file.png){#fig-example}
        ![Caption with [citation]](path.jpg){#fig-example width=40%}
        ![Caption](images/file.png){#fig-example width=95% fig-pos='H'}

    Args:
        content: QMD file content
        fig_id: Full figure ID (e.g., "fig-example")

    Returns:
        Dict with 'type' ('markdown'), 'caption', 'path', 'full_match',
        'start' and 'end', or None if not found
    """
    caption_part = r'((?:[^\[\]]|\[[^\]]*\])*)'
    path_part = r'\(([^)]+)\)'
    attrs_part = rf'\{{\s*#{re.escape(fig_id)}(?=\s|}})[^}}]*\}}'
    figure_re = re.compile(
        r'!\[' + caption_part + r'\]' + path_part + r'\s*' + attrs_part,
        re.MULTILINE,
    )

    found = figure_re.search(content)
    if found is None:
        return None

    raw_caption, raw_path = found.group(1), found.group(2)
    return {
        'type': 'markdown',
        'caption': raw_caption.strip(),
        'path': raw_path.strip(),
        'full_match': found.group(0),
        'start': found.start(),
        'end': found.end()
    }
def detect_r_figure(self, content: str, fig_id: str) -> Optional[Dict[str, str]]:
    """
    Detect an R code block figure by its ``#| label`` and ``#| fig-cap`` options.

    Required format (the two options may appear in either order):
        ```{r}
        #| label: fig-id
        #| fig-cap: "Caption text"
        #| echo: false (optional)
        ... R code ...
        ```

    Each ```{r} block is examined on its own, so a match can never start in
    an earlier, unrelated block. The label must match ``fig_id`` exactly
    (a longer label sharing the same prefix is rejected). The caption may be
    double-quoted, single-quoted or unquoted; a quote character of the other
    kind inside a quoted caption (e.g. an apostrophe) is kept.

    Args:
        content: QMD file content
        fig_id: Full figure ID (e.g., "fig-imagenet-challenge")

    Returns:
        Dict with 'type' ('r'), 'caption', 'r_code' (block body without the
        fences), 'full_match', 'start' and 'end', or None if no block has
        both the label and a fig-cap option
    """
    label_re = re.compile(
        rf'#\|\s*label:\s*{re.escape(fig_id)}\s*$', re.MULTILINE
    )
    # Separate alternatives per quote style so the closing quote must be the
    # same character as the opening one.
    caption_re = re.compile(
        r'''#\|\s*fig-cap:[ \t]*(?:"([^"\n]*)"|'([^'\n]*)'|([^\n]*))'''
    )

    # [^`]* keeps each candidate inside a single fenced block.
    for block in re.finditer(r'```\{r\}([^`]*)```', content):
        body = block.group(1)
        if not label_re.search(body):
            continue
        cap = caption_re.search(body)
        if cap is None:
            continue
        caption = next((part for part in cap.groups() if part), '').strip()
        return {
            'type': 'r',
            'caption': caption,
            'r_code': body.strip(),
            'full_match': block.group(0),
            'start': block.start(),
            'end': block.end()
        }
    return None
def detect_tikz_figure(self, content: str, fig_id: str) -> Optional[Dict[str, str]]:
    """
    Find a div-wrapped TikZ figure with the given ID.

    Expected structure:
        ::: {attributes with #fig-id}
        ```{.tikz}
        % TikZ code here
        ```
        Caption text here
        :::

    The ID may sit anywhere among the div attributes, for example:
        ::: {#fig-neural-network}
        ::: {width=80% #fig-neural-network}
        ::: {#fig-neural-network .column-margin}

    A div that carries the ID but holds no ```{.tikz} block is not reported.

    Args:
        content: QMD file content
        fig_id: Full figure ID (e.g., "fig-neural-network")

    Returns:
        Dict with 'type' ('tikz'), 'caption', 'tikz_code', 'full_match',
        'start' and 'end', or None if not found
    """
    # Div opener with the exact ID, then everything up to the next ":::".
    div_re = rf':::\s*\{{[^}}]*#{re.escape(fig_id)}(?=\s|[}}])[^}}]*\}}\s*\n(.*?):::'
    div = re.search(div_re, content, re.DOTALL)
    if div is None:
        return None

    inner = div.group(1)
    code = re.search(r'```\s*\{\.tikz\}\s*(.*?)\s*```', inner, re.DOTALL)
    if code is None:
        # The div exists but is not a TikZ figure.
        return None

    # Whatever follows the code fence is the caption; collapse its non-empty
    # lines into a single space-separated string.
    trailing = inner[code.end():].strip()
    pieces = []
    for raw_line in trailing.split('\n'):
        cleaned = raw_line.strip()
        if cleaned:
            pieces.append(cleaned)

    return {
        'type': 'tikz',
        'caption': ' '.join(pieces),
        'tikz_code': code.group(1).strip(),
        'full_match': div.group(0),
        'start': div.start(),
        'end': div.end()
    }
def detect_code_figure(self, content: str, fig_id: str) -> Optional[Dict[str, str]]:
    """
    Detect code-generated figures (R/Python blocks).

    Format:
        ```{r}
        #| label: fig-id
        #| fig-cap: "Caption text"
        #| other-options: values
        # R or Python code here
        ggplot(data) + ...
        ```

    The label must equal ``fig_id`` exactly: a block labelled
    ``fig-energy-usage`` is not returned when ``fig-energy`` is requested.
    ``fig-cap`` may appear before or after the label; a block without one
    yields an empty caption.

    Args:
        content: QMD file content
        fig_id: Full figure ID (e.g., "fig-datacenter-energy")

    Returns:
        Dict with 'type' ('code'), 'caption', 'language' ('r' or 'python'),
        'code', 'full_match', 'start' and 'end', or None if not found
    """
    # (?![\w-]) stops the label from matching a longer ID with the same
    # prefix; [^`] keeps the whole match inside one fenced block.
    pattern = (
        rf'```\{{(r|python)[^}}]*\}}'
        rf'([^`]*?#\|\s*label:\s*{re.escape(fig_id)}(?![\w-])[^`]*?)```'
    )
    match = re.search(pattern, content)
    if not match:
        return None

    language = match.group(1)
    code_block = match.group(2)

    if language == 'r':
        # R captions may be YAML-quoted (e.g. when they start with **), so
        # the raw option value is handed to the YAML-aware helper.
        cap_match = re.search(r'#\|\s*fig-cap:\s*(.+)$', code_block, re.MULTILINE)
        if cap_match:
            caption = self.extract_caption_from_yaml_value(cap_match.group(1).strip())
        else:
            caption = ""
    else:
        # Group 1: double-quoted caption, group 2: unquoted caption.
        cap_match = re.search(
            r'#\|\s*fig-cap:\s*(?:"([^"]*)"|(.+))$', code_block, re.MULTILINE
        )
        if cap_match:
            caption = (cap_match.group(1) or cap_match.group(2) or '').strip()
        else:
            caption = ""

    return {
        'type': 'code',
        'caption': caption,
        'language': language,
        'code': code_block.strip(),
        'full_match': match.group(0),
        'start': match.start(),
        'end': match.end()
    }
def detect_table(self, content: str, tbl_id: str) -> Optional[Dict[str, str]]:
    """
    Detect a table caption line by its table ID.

    Supports both formats:
        - Old: : Caption text {#tbl-id}
        - New: Caption text {#tbl-id}

    Examples:
        : AI model comparison {#tbl-models} (old format)
        AI model comparison {#tbl-models} (new format)
        Performance metrics {width=80% #tbl-performance}
        **Special Function Units**: Details... {#tbl-sfu}

    The ID must match exactly: ``tbl-models`` does not match a caption
    tagged ``#tbl-models-large``.

    Args:
        content: QMD file content
        tbl_id: Full table ID (e.g., "tbl-models")

    Returns:
        Dict with 'type' ('table'), 'caption' (without the leading colon),
        'full_match', 'start' and 'end', or None if not found
    """
    # (?![\w-]) rejects longer IDs that merely start with tbl_id.
    attrs = rf'\s*\{{[^}}]*#{re.escape(tbl_id)}(?![\w-])[^}}]*\}}\s*$'
    caption = r'([^{\n]+?)'
    # The colon form is tried first so that the ": " prefix is stripped from
    # the caption; the colon-less form still allows colons inside the text.
    for prefix in (r'^:\s*', r'^'):
        match = re.search(prefix + caption + attrs, content, re.MULTILINE)
        if match:
            return {
                'type': 'table',
                'caption': match.group(1).strip(),
                'full_match': match.group(0),
                'start': match.start(),
                'end': match.end()
            }
    return None
def find_table_definition_in_qmd(self, content: str, tbl_id: str) -> Optional[Dict[str, str]]:
    """
    Table counterpart of the unified figure lookup; delegates to detect_table.

    Args:
        content: QMD file content
        tbl_id: Full table ID (e.g., "tbl-models")

    Returns:
        Whatever detect_table returns: a dict describing the table caption,
        or None if the table is not found
    """
    table_info = self.detect_table(content, tbl_id)
    return table_info
# ================================================================
# SPECIALIZED UPDATE FUNCTIONS FOR DIFFERENT FIGURE/TABLE TYPES
# ================================================================
def update_markdown_figure(self, content: str, fig_id: str, new_caption: str) -> str:
    """
    Update caption in standard markdown figures.

    Updates: ![old_caption](path){#fig-id} → ![new_caption](path){#fig-id}

    Only the image whose attribute block contains exactly ``#fig_id`` is
    touched; other images in the document, including ones that precede it
    or whose ID merely starts with ``fig_id``, are left alone. The caption
    is inserted literally, so backslashes (e.g. LaTeX such as ``$\\alpha$``)
    are preserved.

    Args:
        content: QMD file content
        fig_id: Full figure ID (e.g., "fig-ai-timeline")
        new_caption: New caption text

    Returns:
        Updated content (unchanged when the figure is not found)
    """
    # The old-caption part accepts plain text plus one level of nested
    # brackets (citations) but can never run past the caption's closing "]",
    # so a match cannot begin at an earlier image and swallow what lies
    # between. (?![\w-]) enforces an exact ID match.
    pattern = (
        r'(!\[)(?:[^\[\]]|\[[^\]]*\])*'
        rf'(\]\([^)]+\)\s*\{{[^}}]*#{re.escape(fig_id)}(?![\w-])[^}}]*\}})'
    )

    def insert_caption(match):
        # A callable replacement avoids template-escape processing of
        # backslashes in new_caption.
        return f"{match.group(1)}{new_caption}{match.group(2)}"

    return re.sub(pattern, insert_caption, content)
def update_tikz_figure(self, content: str, fig_id: str, new_caption: str) -> str:
    """
    Update caption in TikZ/Div block figures.

    Replaces the text between the closing ``` of the div's code block and
    the div's closing ``:::``. The surrounding whitespace is kept, so the
    layout of the block is unchanged.

    The div is matched by exact ID (anywhere among its attributes) and the
    edit is confined to that div, so captions containing a colon (such as
    ``**Key**: explanation``) are handled and no other figure is touched.

    Args:
        content: QMD file content
        fig_id: Full figure ID (e.g., "fig-neural-network")
        new_caption: New caption text

    Returns:
        Updated content (unchanged when the div is not found or holds no
        fenced code block)
    """
    div_pattern = (
        rf'(:::\s*\{{[^}}]*#{re.escape(fig_id)}(?=\s|[}}])[^}}]*\}}\s*\n)'
        r'(.*?)(:::)'
    )

    def replace_caption(match):
        opener, body, closer = match.group(1), match.group(2), match.group(3)
        fence = re.search(r'```.*?```', body, re.DOTALL)
        if fence is None:
            # No code block inside this div: leave it exactly as it was.
            return match.group(0)

        code_part = body[:fence.end()]
        tail = body[fence.end():]
        if tail.strip():
            lead = tail[:len(tail) - len(tail.lstrip())]
            trail = tail[len(tail.rstrip()):]
        else:
            # No caption yet: put the new one on its own line.
            lead, trail = "\n", "\n"
        # The caption must be separated from both the fence and ":::".
        if '\n' not in lead:
            lead = "\n"
        if '\n' not in trail:
            trail = "\n"
        return f"{opener}{code_part}{lead}{new_caption}{trail}{closer}"

    return re.sub(div_pattern, replace_caption, content, flags=re.DOTALL)
def update_code_figure(self, content: str, fig_id: str, new_caption: str) -> str:
    """
    Update caption in code-generated figures (R/Python blocks).

    Updates: #| fig-cap: "old caption" → #| fig-cap: "new caption"

    The block is selected by an exact ``#| label`` match, and its first
    ``#| fig-cap`` line is rewritten whether it comes before or after the
    label. For R blocks the value comes from
    ``self.ensure_yaml_safe_caption``; for Python blocks the caption is
    wrapped in double quotes.

    Args:
        content: QMD file content
        fig_id: Full figure ID (e.g., "fig-datacenter-energy")
        new_caption: New caption text

    Returns:
        Updated content (unchanged when no matching block is found or the
        block has no fig-cap line)
    """
    # Same block shape as detect_code_figure, with an exact-ID guard;
    # [^`] keeps the match inside one fenced block.
    block_pattern = (
        rf'(```\{{(r|python)[^}}]*\}})'
        rf'([^`]*?#\|\s*label:\s*{re.escape(fig_id)}(?![\w-])[^`]*?)(```)'
    )

    def replace_block(match):
        header, language, body, fence = match.groups()
        if language == 'r':
            value = self.ensure_yaml_safe_caption(new_caption)
        else:
            value = f'"{new_caption}"'
        # Callable replacement: the caption is inserted literally, and only
        # the rest of the fig-cap line is replaced.
        new_body = re.sub(
            r'(#\|\s*fig-cap:[ \t]*)[^\n]*',
            lambda cap: cap.group(1) + value,
            body,
            count=1,
        )
        return f"{header}{new_body}{fence}"

    return re.sub(block_pattern, replace_block, content)
def update_table_caption(self, content: str, tbl_id: str, new_caption: str) -> str:
    """
    Update table captions using the simple, consistent format.

    Format: `: [caption]. {#tbl-id [attributes]}`

    Only the caption line itself is rewritten: blank lines before and after
    it are preserved, and a caption whose ID merely starts with ``tbl_id``
    is not touched.

    Args:
        content: QMD file content
        tbl_id: Full table ID (e.g., "tbl-models")
        new_caption: New caption text (may or may not have `: ` prefix)

    Returns:
        Updated content
    """
    # Horizontal whitespace only ([ \t]) so the match stays on one line;
    # \s would also consume the newlines around the caption. The lookahead
    # tolerates CRLF line endings. (?![\w-]) enforces an exact ID match.
    pattern = (
        rf'^:?[ \t]*([^{{\n]+?)'
        rf'([ \t]*\{{[^}}]*#{re.escape(tbl_id)}(?![\w-])[^}}]*\}})'
        r'[ \t]*(?=\r?$)'
    )

    def replacement_func(match):
        # Preserve the {#tbl-id ...} attribute block as written.
        attributes = match.group(2)
        if new_caption.startswith(': '):
            # Caller already supplied the prefix; use the caption as-is to
            # avoid a double colon.
            formatted_caption = new_caption
        elif new_caption.endswith('.'):
            formatted_caption = f': {new_caption}'
        else:
            # Add the prefix and make sure the caption ends with a period.
            formatted_caption = f': {new_caption}.'
        return f'{formatted_caption} {attributes.strip()}'

    return re.sub(pattern, replacement_func, content, flags=re.MULTILINE)
def update_table_caption_in_qmd(self, content: str, tbl_id: str, new_caption: str) -> str:
    """
    Table counterpart of the unified figure update; delegates to update_table_caption.

    Args:
        content: QMD file content
        tbl_id: Full table ID (e.g., "tbl-models")
        new_caption: New caption text

    Returns:
        Content as returned by update_table_caption
    """
    rewritten = self.update_table_caption(content, tbl_id, new_caption)
    return rewritten
# ================================================================
# UNIFIED UPDATE FUNCTION
# ================================================================
def update_figure_caption_in_qmd(self, content: str, fig_id: str, new_caption: str) -> str:
    """
    Update a figure caption regardless of how the figure is written.

    The figure is first located with find_figure_definition_in_qmd; its
    reported 'type' then selects the updater. 'tikz' and 'code' have
    dedicated updaters; 'markdown' and any other type use the markdown one.

    Args:
        content: QMD file content
        fig_id: Full figure ID (e.g., "fig-example")
        new_caption: New caption text

    Returns:
        Updated content, or the original content when the figure is not found
    """
    definition = self.find_figure_definition_in_qmd(content, fig_id)
    if not definition:
        return content

    # Method names rather than bound methods, so only the chosen updater is
    # looked up on self.
    specialised = {
        'tikz': 'update_tikz_figure',
        'code': 'update_code_figure',
    }
    updater_name = specialised.get(definition['type'], 'update_markdown_figure')
    return getattr(self, updater_name)(content, fig_id, new_caption)
def print_summary(self) -> None:
    """Print the counters collected in ``self.stats`` as a framed report.

    Always prints the per-category counts. The figure improvement rate, the
    numbered error list, the missing-image tip and the TikZ section are
    printed only when their respective counters are non-zero / non-empty.
    """
    stats = self.stats
    rule = '=' * 60

    print(f"\n{rule}")
    print("📊 CAPTION IMPROVEMENT SUMMARY")
    print(rule)

    # (line template, stats key) pairs, printed in this order.
    counter_rows = (
        ("Files processed: {}", 'files_processed'),
        ("Figures found: {}", 'figures_found'),
        ("Figures improved: {}", 'figures_improved'),
        ("Tables found: {}", 'tables_found'),
        ("Tables improved: {}", 'tables_improved'),
        ("Images found: {} 🖼️", 'images_found'),
        ("Images missing: {} ⚠️", 'images_missing'),
        ("JSON success: {} 📋", 'json_success'),
        ("JSON failed: {} 🚫", 'json_failed'),
    )
    for template, key in counter_rows:
        print(template.format(stats[key]))

    if stats['figures_improved'] > 0:
        improvement_rate = (stats['figures_improved'] / stats['figures_found']) * 100
        print(f"Improvement rate: {improvement_rate:.1f}%")

    if stats['errors']:
        print(f"\n⚠️ Issues encountered ({len(stats['errors'])}):")
        for number, problem in enumerate(stats['errors'], 1):
            print(f" {number}. {problem}")

    if stats['images_missing'] > 0:
        print(f"\n💡 Tip: {stats['images_missing']} images were not found.")
        print(" Consider checking image paths or using text-only processing.")

    if stats['tikz_found'] > 0:
        print(f"\n🔧 TikZ Processing: Found {stats['tikz_found']} TikZ figures.")
        if stats['tikz_failed'] > 0:
            print(f" ⚠️ {stats['tikz_failed']} TikZ compilations failed.")
            print(" Ensure you have LaTeX (pdflatex) and ImageMagick (magick) installed.")

    print(rule)
def validate_qmd_mapping(self, directories: List[str], content_map: Dict) -> Dict:
    """Check that every figure/table in the content map is defined in a QMD file.

    Each QMD file (in the order given by find_qmd_files_in_order) is read
    and searched for the IDs not yet located; the first file defining an ID
    wins. For located items the content map entry gains 'source_file' and
    'qmd_caption'. A summary, including any missing IDs, is printed.

    Args:
        directories: Directories to scan.
        content_map: Map with optional 'figures' and 'tables' dicts keyed by
            ID; updated in place.

    Returns:
        The same content_map object. It is returned untouched when the
        commented-chapter check asks to halt.
    """
    print(f"🔍 Validating QMD mapping...")

    # Commented-out chapters are checked before any file is scanned.
    chapter_issues = self.check_commented_chapters_in_directories(directories)
    if self.print_commented_chapter_issues(chapter_issues):
        return content_map

    qmd_files = self.find_qmd_files_in_order(directories)
    print(f"📁 Scanning {len(qmd_files)} QMD files")

    found_figures = {}
    found_tables = {}

    for qmd_file in qmd_files:
        try:
            with open(qmd_file, 'r', encoding='utf-8') as handle:
                text = handle.read()
            print(f" 📄 Scanning: {qmd_file}")

            # Figures listed in the content map that are still unlocated.
            for fig_id in content_map.get('figures', {}):
                if fig_id in found_figures:
                    continue
                fig_def = self.find_figure_definition_in_qmd(text, fig_id)
                if not fig_def:
                    continue
                found_figures[fig_id] = {
                    'qmd_file': qmd_file,
                    'current_caption': fig_def['caption'],
                    'definition': fig_def
                }
                print(f" ✅ Found figure: {fig_id}")

            # Same pass for tables.
            for tbl_id in content_map.get('tables', {}):
                if tbl_id in found_tables:
                    continue
                tbl_def = self.find_table_definition_in_qmd(text, tbl_id)
                if not tbl_def:
                    continue
                found_tables[tbl_id] = {
                    'qmd_file': qmd_file,
                    'current_caption': tbl_def['caption'],
                    'definition': tbl_def
                }
                print(f" ✅ Found table: {tbl_id}")
        except Exception as e:
            print(f" ❌ Error scanning {qmd_file}: {e}")

    # Anything in the map that no file defined.
    missing_figures = [
        fig_id for fig_id in content_map.get('figures', {})
        if fig_id not in found_figures
    ]
    missing_tables = [
        tbl_id for tbl_id in content_map.get('tables', {})
        if tbl_id not in found_tables
    ]

    # Record where each located item lives and its caption as found in QMD.
    for fig_id, located in found_figures.items():
        entry = content_map['figures'][fig_id]
        entry['source_file'] = located['qmd_file']
        entry['qmd_caption'] = located['current_caption']
    for tbl_id, located in found_tables.items():
        entry = content_map['tables'][tbl_id]
        entry['source_file'] = located['qmd_file']
        entry['qmd_caption'] = located['current_caption']

    print(f"\n📊 VALIDATION SUMMARY:")
    print(f" Figures: {len(found_figures)}/{len(content_map.get('figures', {}))} found in QMD")
    print(f" Tables: {len(found_tables)}/{len(content_map.get('tables', {}))} found in QMD")

    if missing_figures:
        print(f"\n❌ Missing figures ({len(missing_figures)}):")
        for fig_id in missing_figures:
            print(f" - {fig_id}")
    if missing_tables:
        print(f"\n❌ Missing tables ({len(missing_tables)}):")
        for tbl_id in missing_tables:
            print(f" - {tbl_id}")
    if not (missing_figures or missing_tables):
        print(f"\n✅ Perfect mapping! All items found in QMD files.")

    return content_map
def check_caption_quality(self, directories: List[str], figures_only: bool = False, tables_only: bool = False, listings_only: bool = False, content_map: Optional[Dict] = None) -> Dict[str, any]:
    """
    Run the quality checker over figure and table captions and summarize the findings.

    Args:
        directories: List of directories to scan
        figures_only: Forwarded to the content-map builder when a map must be built
        tables_only: Forwarded to the content-map builder when a map must be built
        listings_only: Forwarded to the content-map builder when a map must be built
        content_map: Optional pre-built content map; when given, nothing is rebuilt

    Returns:
        Dict with 'total_captions', 'captions_needing_repair', 'issues_by_type'
        and 'detailed_issues'; an empty dict when the run is halted or no
        content map could be built.
    """
    print("🔍 Analyzing caption quality...")

    # Bail out before any analysis when the commented-chapter check asks to halt.
    chapter_issues = self.check_commented_chapters_in_directories(directories)
    if self.print_commented_chapter_issues(chapter_issues):
        return {}

    if content_map is None:
        content_map = self.build_content_map_from_qmd(
            directories,
            figures_only=figures_only,
            tables_only=tables_only,
            listings_only=listings_only,
        )
        if not content_map:
            print("❌ Failed to build content map for analysis.")
            return {}

    # The result is not consulted below; the call is retained as-is.
    qmd_files = self.find_qmd_files_in_order(directories)

    analyzed = 0
    flagged = []
    tally = {}

    # Figures first, then tables: the same bookkeeping applies to both.
    for section, kind in (('figures', 'figure'), ('tables', 'table')):
        for item_id, item_data in content_map.get(section, {}).items():
            caption = item_data.get('current_caption', '')
            verdict = self.quality_checker.analyze_caption(caption)
            analyzed += 1

            if not verdict['needs_repair']:
                continue

            flagged.append({
                'id': item_id,
                'type': kind,
                'current_caption': caption,
                'issues': verdict['issues']
            })
            for problem in verdict['issues']:
                problem_type = problem['type']
                tally[problem_type] = tally.get(problem_type, 0) + 1

    return {
        'total_captions': analyzed,
        'captions_needing_repair': len(flagged),
        'issues_by_type': tally,
        'detailed_issues': flagged
    }
def print_quality_report(self, report: Dict[str, any]):
    """
    Print a formatted summary of a caption quality report.

    Args:
        report: Dict with 'total_captions', 'captions_needing_repair',
            'issues_by_type' and 'detailed_issues'. An empty/falsy report
            prints nothing.
    """
    if not report:
        return

    total = report['total_captions']
    needing_repair = report['captions_needing_repair']
    percentage = (needing_repair / total * 100) if total > 0 else 0

    print(f"\n📊 Caption Quality Report:")
    print(f"={'=' * 60}")
    print(f"Total captions: {total}")
    print(f"Need repair: {needing_repair} ({percentage:.1f}%)")

    if report['issues_by_type']:
        print(f"\n🔍 Issues by type:")
        for issue_type, count in report['issues_by_type'].items():
            print(f"{issue_type.replace('_', ' ').title()}: {count}")

    if report['detailed_issues']:
        # One source of truth for the column widths so borders and rows
        # can never drift apart (the borders previously repeated an empty
        # string and therefore rendered with zero width).
        column_widths = (18, 12, 35)

        def rule(left, joint, right):
            # Each cell is padded with one space on both sides, hence w + 2.
            return left + joint.join('─' * (w + 2) for w in column_widths) + right

        def row(*cells):
            padded = (f"{cell:<{w}}" for cell, w in zip(cells, column_widths))
            return '│ ' + ' │ '.join(padded) + ' │'

        print(f"\n📝 Detailed Issues:")
        print(rule('┌', '┬', '┐'))
        print(row('ID', 'Issue', 'Current Caption'))
        print(rule('├', '┼', '┤'))

        for issue_item in report['detailed_issues'][:20]:  # Limit to first 20
            # Truncate every cell to its column width to keep the grid intact.
            item_id = issue_item['id'][:18]
            caption = issue_item['current_caption'][:35]
            issues_desc = ', '.join(issue['description'] for issue in issue_item['issues'])[:12]
            print(row(item_id, issues_desc, caption))

        print(rule('└', '┴', '┘'))

        if len(report['detailed_issues']) > 20:
            print(f"... and {len(report['detailed_issues']) - 20} more issues")

    if needing_repair > 0:
        print(f"\n💡 To fix these issues, run:")
        print(f" python {__file__} --repair -d {' -d '.join(['contents/vol1/'])}")
    else:
        print(f"\n✅ All captions look good!")
def repair_captions(self, directories: List[str], figures_only: bool = False, tables_only: bool = False, listings_only: bool = False, specific_files: List[str] = None):
    """
    Repair only captions that the quality check flags.

    Flagged items are split in two groups: those with a
    'missing_bold_pattern' issue are regenerated through the LLM path, all
    others receive punctuation/case normalization. When at least one caption
    changed, the content map is saved and the QMD files are updated.

    Args:
        directories: List of directories to scan
        figures_only: If True, only repair figures (ignore tables and listings)
        tables_only: If True, only repair tables (ignore figures and listings)
        listings_only: If True, only repair listings (ignore figures and tables)
        specific_files: Optional list of files forwarded to the content-map builder

    Returns:
        {} when the run is halted (commented chapters, or the quality check
        produced no report), the content map when nothing needs repair, and
        None after repairs were attempted.
    """
    print("🔧 Repairing captions that need fixing...")

    # Check for commented chapters first
    commented_issues = self.check_commented_chapters_in_directories(directories)
    if self.print_commented_chapter_issues(commented_issues):
        return {}

    # Build content map with filtering
    content_map = self.build_content_map_from_qmd(directories, figures_only=figures_only, tables_only=tables_only, listings_only=listings_only, specific_files=specific_files)

    # Check caption quality once using the existing content map
    quality_report = self.check_caption_quality(directories, figures_only=figures_only, tables_only=tables_only, listings_only=listings_only, content_map=content_map)
    if not quality_report:
        # The quality check aborted and returned no report; indexing it
        # would raise KeyError, so mirror the halt path above instead.
        return {}

    detailed_issues = quality_report.get('detailed_issues', [])

    # Index the issues by id once (first entry wins) instead of rescanning
    # the whole report for every repair item.
    issues_by_id = {}
    for issue_item in detailed_issues:
        issues_by_id.setdefault(issue_item['id'], issue_item['issues'])

    # Create repair list - only items that need fixing
    section_for_type = {'figure': 'figures', 'table': 'tables'}
    repair_items = []
    for issue_item in detailed_issues:
        item_id = issue_item['id']
        item_type = issue_item['type']
        section = section_for_type.get(item_type)
        if section and item_id in content_map.get(section, {}):
            repair_items.append((item_type, item_id, content_map[section][item_id]))

    # If no repairs needed, exit early
    if not repair_items:
        print("✅ No captions need repair!")
        return content_map

    # Separate items by fix type and show preview
    basic_fixes = []
    llm_fixes = []

    print(f"\n📋 Found {len(repair_items)} captions that need repair:")
    print("=" * 60)

    for item_type, item_id, item_data in repair_items:
        item_issues = issues_by_id.get(item_id, [])
        issue_types = [issue['type'] for issue in item_issues]
        issue_descriptions = [issue['description'] for issue in item_issues]

        # A missing bold prefix cannot be fixed mechanically; it needs the LLM.
        if 'missing_bold_pattern' in issue_types:
            llm_fixes.append((item_type, item_id, item_data))
            icon = "🤖"
            fix_type = "LLM regeneration"
        else:
            basic_fixes.append((item_type, item_id, item_data))
            icon = "🔧"
            fix_type = "Basic formatting"

        # Show preview (captions longer than 50 characters are truncated)
        type_icon = "📊" if item_type == "figure" else "📋"
        source_file = item_data.get('source_file', 'unknown')
        preview = item_data.get('current_caption', '')
        if len(preview) > 50:
            preview = preview[:50] + "..."

        print(f"{icon} {type_icon} {item_id}")
        print(f" File: {source_file}")
        print(f" Current: {preview}")
        print(f" Issues: {', '.join(issue_descriptions)}")
        print(f" Fix: {fix_type}")
        print()

    print(f"🔧 Basic fixes: {len(basic_fixes)} items")
    print(f"🤖 LLM fixes: {len(llm_fixes)} items")
    print("=" * 60)
    print("🚀 Starting repairs...\n")

    fixed_count = 0

    # Apply basic normalization fixes
    for item_type, item_id, item_data in basic_fixes:
        print(f"🔧 Processing {item_id}...", end="")
        current_caption = item_data.get('current_caption', '')

        fixed_caption = self.normalize_caption_punctuation(current_caption)
        fixed_caption = self.normalize_caption_case(fixed_caption)

        # Only update if it actually changed
        if fixed_caption != current_caption:
            item_data['new_caption'] = fixed_caption
            item_data['current_caption'] = fixed_caption  # Update current_caption too
            # Report success on the progress line, matching the no-change branch.
            print(f" ✅")
            fixed_count += 1
        else:
            print(f" 🔄 (no change needed)")

    # Apply LLM fixes for missing **Bold**: format
    if llm_fixes:
        print(f"🤖 Generating properly formatted captions for {len(llm_fixes)} items...")
        fixed_count += self._apply_llm_repairs(llm_fixes, content_map)

    if fixed_count > 0:
        # Save updated content map
        self.save_content_map(content_map)

        # Update QMD files
        self.process_qmd_files(directories, content_map)

        # Show summary of what was changed
        print("\n" + "=" * 60)
        print(f"✅ Successfully repaired {fixed_count} captions!")
        print(f"🔧 Basic fixes: {len([f for f in basic_fixes if f[2].get('new_caption')])}")
        print(f"🤖 LLM fixes: {len([f for f in llm_fixes if f[2].get('new_caption')])}")
        print("💾 Content map saved to content_map.json")
        print("📝 Check git diff to see what changed in the QMD files")
        print("=" * 60)
    else:
        print("\n✅ No automatic repairs needed!")
def _apply_llm_repairs(self, llm_fixes, content_map):
"""Apply LLM-based repairs for captions missing **Bold**: format."""
fixed_count = 0
for item_type, item_id, item_data in llm_fixes:
print(f"🤖 Generating caption for {item_id}...", end="")
try:
source_file = item_data.get('source_file')
if not source_file:
print(f" ❌ (no source file)")
continue
# Read file content for context extraction
with open(source_file, 'r', encoding='utf-8') as f:
file_content = f.read()
# Extract context around this item
context = self.extract_section_context(file_content, item_id)
current_caption = item_data.get('current_caption', '')
# Generate improved caption with appropriate parameters
if item_type == 'figure':
# Check if it's a markdown figure for image path
image_path = None
if item_data.get('type') == 'markdown':
image_pattern = rf'!\[[^\]]*\]\(([^)]+)\)[^{{]*{{[^}}]*#{re.escape(item_id)}'
match = re.search(image_pattern, file_content)
if match:
relative_path = match.group(1)
source_dir = Path(source_file).parent
image_path = str(source_dir / relative_path)
if not os.path.exists(image_path):
image_path = None
new_caption = self.generate_caption_with_ollama(
context['title'], context['content'], item_id, current_caption,
image_path, is_table=False
)
elif item_type == 'table':
new_caption = self.generate_caption_with_ollama(
context['title'], context['content'], item_id, current_caption,
None, is_table=True
)
elif item_type == 'listing':
# Get code content for listings
code_content = ""
lst_def = self.find_listing_definition_in_qmd(file_content, item_id)
if lst_def:
code_pattern = r'```[^`]*```'
code_match = re.search(code_pattern, lst_def['full_text'], re.DOTALL)
if code_match:
code_content = code_match.group(0)
new_caption = self.generate_caption_with_ollama(
context['title'], context['content'], item_id, current_caption,
None, is_table=False, is_listing=True, code_content=code_content
)
else:
continue
if new_caption and new_caption != current_caption:
item_data['new_caption'] = new_caption
item_data['current_caption'] = new_caption
word_count = len(new_caption.split())
print(f" ✅ ({word_count} words)")
fixed_count += 1
else:
print(f" 🔄 (no improvement)")
except Exception as e:
print(f" ❌ (error: {str(e)[:50]}...)")
return fixed_count
# ================================================================
# QMD-FOCUSED CONTENT MAP BUILDING
# ================================================================
def build_content_map_from_qmd(self, directories: List[str], figures_only: bool = False, tables_only: bool = False, listings_only: bool = False, specific_files: List[str] = None) -> Dict:
    """
    Build a content map of figures, tables and listings by scanning QMD files.

    Every file is read as UTF-8 text, candidate IDs are collected with
    regular expressions, and each candidate is resolved through the
    matching definition finder. IDs are processed in the order they first
    appear in the file, so the resulting map (and the log output) is
    identical from run to run.

    Args:
        directories: List of directories to scan for .qmd files
        figures_only: If True, only process figures (ignore tables and listings)
        tables_only: If True, only process tables (ignore figures and listings)
        listings_only: If True, only process listings (ignore figures and tables)
        specific_files: List of specific files to include (optional); when
            given, directories are not scanned and the commented-chapter
            check is skipped

    Returns:
        Dict with 'figures', 'tables', 'listings' and 'metadata' (including
        'extraction_stats'); an empty dict when the commented-chapter check
        asks to halt.
    """
    print(f"📄 Building content map from QMD files...")

    # Check for commented chapters first (only if processing directories, not specific files)
    if not specific_files and directories:
        commented_issues = self.check_commented_chapters_in_directories(directories)
        if self.print_commented_chapter_issues(commented_issues):
            return {}

    # Get QMD files - use specific files if provided, otherwise scan directories
    if specific_files:
        qmd_files = [Path(f) for f in specific_files]
        print(f"📖 Processing {len(qmd_files)} specific QMD files")
    else:
        qmd_files = self.find_qmd_files_in_order(directories)
        print(f"📖 Scanning {len(qmd_files)} QMD files in book order")

    # Resolve the type filters once; the same flags drive processing and reporting.
    process_figures = not tables_only and not listings_only
    process_tables = not figures_only and not listings_only
    process_listings = (not figures_only and not tables_only) or listings_only

    content_map = {
        'figures': {},
        'tables': {},
        'listings': {},
        'metadata': {
            'creation_time': datetime.now().isoformat(),
            'source': 'qmd_direct_scan',
            'directories': directories,
            'qmd_files_scanned': len(qmd_files),
            'extraction_stats': {
                'figures_found': 0,
                'tables_found': 0,
                'listings_found': 0,
                'markdown_figures': 0,
                'tikz_figures': 0,
                'r_figures': 0,
                'code_figures': 0,
                'extraction_failures': 0,
                'failed_extractions': [],
                'files_with_issues': []
            }
        }
    }
    stats = content_map['metadata']['extraction_stats']

    # Compiled once instead of once per file.
    # The figure pattern matches both {#fig-id} and #| label: fig-id formats.
    fig_id_re = re.compile(r'(?:#\|\s*label:\s*(fig-[a-zA-Z0-9_-]+)|#(fig-[a-zA-Z0-9_-]+))')
    tbl_id_re = re.compile(r'#(tbl-[a-zA-Z0-9_-]+)')
    lst_id_re = re.compile(r'#(lst-[a-zA-Z0-9_-]+)')

    def flag_file(qmd_file):
        # Remember each problematic file only once.
        if qmd_file not in stats['files_with_issues']:
            stats['files_with_issues'].append(qmd_file)

    def record_failure(item_id, qmd_file):
        # Shared bookkeeping for an ID that could not be extracted.
        stats['extraction_failures'] += 1
        stats['failed_extractions'].append(item_id)
        flag_file(qmd_file)

    def store_figure(fig_id, fig_def, qmd_file):
        # Store original caption as-is from the file
        original_caption = fig_def['caption']
        content_map['figures'][fig_id] = {
            'original_caption': original_caption,
            'current_caption': original_caption,  # Set current_caption for quality analysis
            'new_caption': '',
            'type': fig_def['type'],
            'source_file': qmd_file
        }
        print(f" ✅ Found figure: {fig_id} ({fig_def['type']})")
        stats['figures_found'] += 1

        # Count by type; 'code' figures written in R are counted as R figures.
        fig_type = fig_def['type']
        if fig_type == 'markdown':
            stats['markdown_figures'] += 1
        elif fig_type == 'tikz':
            stats['tikz_figures'] += 1
        elif fig_type == 'r':
            stats['r_figures'] += 1
        elif fig_type == 'code':
            if fig_def.get('language') == 'r':
                stats['r_figures'] += 1
            else:
                stats['code_figures'] += 1

    def store_table(tbl_id, tbl_def, qmd_file):
        original_caption = tbl_def['caption']
        content_map['tables'][tbl_id] = {
            'original_caption': original_caption,
            'current_caption': original_caption,  # Set current_caption for quality analysis
            'new_caption': '',
            'type': 'table',
            'source_file': qmd_file
        }
        print(f" ✅ Found table: {tbl_id}")
        stats['tables_found'] += 1

    def store_listing(lst_id, lst_def, qmd_file):
        original_caption = lst_def['caption']
        content_map['listings'][lst_id] = {
            'original_caption': original_caption,
            'current_caption': original_caption,  # Set current_caption for quality analysis
            'new_caption': '',
            'type': 'listing',
            'language': lst_def.get('language', ''),
            'source_file': qmd_file,
            'instances': lst_def.get('instances', []),  # Store for atomic updates
            'consistency_warnings': lst_def.get('consistency_warnings', [])
        }
        print(f" ✅ Found listing: {lst_id} ({lst_def.get('language', 'unknown')})")
        stats['listings_found'] += 1

    def extract_all(item_ids, finder, store, content, qmd_file):
        # Resolve every candidate ID; a failure on one ID never stops the others.
        found = 0
        for item_id in item_ids:
            try:
                definition = finder(content, item_id)
                if definition:
                    store(item_id, definition, qmd_file)
                    found += 1
                else:
                    print(f" ⚠️ Failed to extract: {item_id}")
                    record_failure(item_id, qmd_file)
            except Exception as e:
                print(f" ❌ Error processing {item_id}: {e}")
                record_failure(item_id, qmd_file)
        return found

    for qmd_file in qmd_files:
        try:
            print(f" 📄 Scanning: {qmd_file}")
            with open(qmd_file, 'r', encoding='utf-8') as f:
                content = f.read()

            # dict.fromkeys de-duplicates while keeping first-occurrence
            # order; a plain set would iterate in an order that changes
            # between interpreter runs.
            potential_fig_ids = list(dict.fromkeys(
                m.group(1) or m.group(2) for m in fig_id_re.finditer(content)
            ))
            potential_tbl_ids = list(dict.fromkeys(tbl_id_re.findall(content)))
            potential_lst_ids = list(dict.fromkeys(lst_id_re.findall(content)))

            file_figures = 0
            file_tables = 0
            file_listings = 0

            if process_figures:
                file_figures = extract_all(
                    potential_fig_ids, self.find_figure_definition_in_qmd,
                    store_figure, content, qmd_file
                )
            if process_tables:
                file_tables = extract_all(
                    potential_tbl_ids, self.detect_table,
                    store_table, content, qmd_file
                )
            if process_listings:
                file_listings = extract_all(
                    potential_lst_ids, self.find_listing_definition_enhanced,
                    store_listing, content, qmd_file
                )

            # Summary for this file
            if file_figures > 0 or file_tables > 0 or file_listings > 0:
                print(f" 📊 File summary: {file_figures} figures, {file_tables} tables, {file_listings} listings")
        except Exception as e:
            # An unreadable file counts as one failure but has no ID to list.
            print(f" ❌ Error reading {qmd_file}: {e}")
            stats['extraction_failures'] += 1
            flag_file(qmd_file)

    # Final summary - only show relevant types based on filter mode
    print(f"\n📊 QMD EXTRACTION SUMMARY:")
    if process_figures:
        print(f" 📊 Figures: {stats['figures_found']} found")
        print(f" • Markdown: {stats['markdown_figures']}")
        print(f" • TikZ: {stats['tikz_figures']}")
        print(f" • R: {stats['r_figures']}")
        print(f" • Code: {stats['code_figures']}")
    if process_tables:
        print(f" 📋 Tables: {stats['tables_found']} found")
    if process_listings:
        print(f" 📝 Listings: {stats['listings_found']} found")
    print(f" ⚠️ Extraction failures: {stats['extraction_failures']}")

    # Show specific failed extractions
    if stats['extraction_failures'] > 0 and 'failed_extractions' in stats:
        print(f" 📋 Failed extractions:")
        for failed_id in stats['failed_extractions']:
            print(f"{failed_id}")
        if stats['files_with_issues']:
            print(f" 📁 Files with issues: {len(stats['files_with_issues'])}")
            for path in stats['files_with_issues'][:5]:  # Show first 5
                print(f"{path}")
            if len(stats['files_with_issues']) > 5:
                print(f" • ... and {len(stats['files_with_issues']) - 5} more")

    # Calculate success rate based on what we're processing
    processed_count = 0
    if process_figures:
        processed_count += stats['figures_found']
    if process_tables:
        processed_count += stats['tables_found']
    if process_listings:
        processed_count += stats['listings_found']
    total_ids = processed_count + stats['extraction_failures']
    success_rate = processed_count / total_ids * 100 if total_ids > 0 else 0
    print(f" ✅ Success rate: {success_rate:.1f}%")

    return content_map
def process_qmd_files(self, directories: List[str], content_map: Optional[Dict] = None):
    """
    Write pending caption changes from the content map back into the QMD files.

    Every figure, table and listing entry that carries a non-empty
    'new_caption' (together with a 'source_file' and an 'original_caption')
    is handed, one at a time, to apply_targeted_caption_update.

    Args:
        directories: List of directories to process
        content_map: Content map with figures, tables and listings data
            (optional, built from the directories when None)
    """
    print("📝 Processing QMD files for caption updates...")

    if content_map is None:
        print("📄 Building content map from QMD files...")
        content_map = self.build_content_map_from_qmd(directories)
        if not content_map:
            print("❌ Failed to build content map")
            return

    # Gather the pending updates: figures first, then tables, then listings.
    pending = []
    for section, kind in (('figures', 'figure'), ('tables', 'table'), ('listings', 'listing')):
        for entry_id, entry in content_map.get(section, {}).items():
            replacement = entry.get('new_caption')
            if not replacement:
                continue
            source_file = entry.get('source_file')
            original_caption = entry.get('original_caption', '')
            # Without a file or the old text there is nothing to search for.
            if not (source_file and original_caption):
                continue
            pending.append({
                'file': source_file,
                'id': entry_id,
                'type': kind,
                'original_caption': original_caption,
                'new_caption': replacement
            })

    if not pending:
        print(" No caption updates needed (no new_caption entries found)")
        return

    succeeded = 0
    failed = 0
    for update in pending:
        try:
            print(f"📄 Updating {update['id']} in {update['file']}")
            applied = self.apply_targeted_caption_update(
                update['file'], update['id'], update['type'],
                update['original_caption'], update['new_caption']
            )
            if applied:
                succeeded += 1
                print(f" ✅ Updated: {update['id']}")
            else:
                failed += 1
                print(f" ❌ Failed: {update['id']} (pattern not found)")
        except Exception as e:
            # Keep going: one broken update must not block the rest.
            failed += 1
            print(f" ❌ Error updating {update['id']}: {e}")

    print(f"📊 Summary: {succeeded} successful updates, {failed} failed")
def apply_targeted_caption_update(self, file_path: str, item_id: str, item_type: str,
                                  original_caption: str, new_caption: str) -> bool:
    """
    Replace one caption in a QMD file and write the file back.

    Listings are rewritten through the multi-instance helpers (all
    instances of the ID are updated together). Figures and tables use a
    literal search-and-replace that is only applied when the text to
    replace occurs exactly once in the file.

    Args:
        file_path: Path to the QMD file
        item_id: Figure, table or listing ID (e.g., "fig-example", "tbl-data", "lst-code")
        item_type: "figure", "table", or "listing"
        original_caption: Current caption text to find
        new_caption: New caption text to replace with

    Returns:
        True if the file was rewritten, False otherwise (including on any error)
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            content = handle.read()

        if item_type == 'listing':
            instances = self.detect_all_instances_by_id(content, item_id, "listing")
            if not instances:
                print(f" ⚠️ No instances found for {item_id}")
                return False

            # Compare what is in the file with what the caller expects.
            canonical_caption, _warnings = self.canonicalize_caption(instances)
            if canonical_caption != original_caption:
                print(f" ⚠️ Caption mismatch for {item_id}: expected '{original_caption}', found '{canonical_caption}'")
                # Tolerate the mismatch only when one text contains the other.
                expected_inside = original_caption.strip() in canonical_caption
                found_inside = canonical_caption.strip() in original_caption
                if not expected_inside and not found_inside:
                    return False

            updated_content, success, errors = self.update_all_instances_atomically(
                content, item_id, instances, new_caption
            )
            if not success:
                for error in errors:
                    print(f"{error}")
                return False

            with open(file_path, 'w', encoding='utf-8') as handle:
                handle.write(updated_content)
            print(f" ✅ Updated {len(instances)} instances atomically")
            return True

        # Figures and tables: pick the builder for the literal old/new text.
        if item_type == 'figure':
            build_patterns = self.build_figure_search_patterns
        elif item_type == 'table':
            build_patterns = self.build_table_search_patterns
        else:
            return False

        old_pattern, new_pattern = build_patterns(item_id, original_caption, new_caption, content)
        if not old_pattern:
            return False

        # Refuse ambiguous replacements: the old text must be unique.
        occurrences = content.count(old_pattern)
        if occurrences != 1:
            print(f" ⚠️ Pattern occurs {occurrences} times (expected 1)")
            return False

        with open(file_path, 'w', encoding='utf-8') as handle:
            handle.write(content.replace(old_pattern, new_pattern))
        return True
    except Exception as e:
        print(f" ❌ Error in targeted update: {e}")
        return False
def build_figure_search_patterns(self, fig_id: str, original_caption: str,
                                 new_caption: str, content: str) -> Tuple[str, str]:
    """
    Locate a figure caption in the content and build its replacement text.

    The figure formats are tried in a fixed order and the first hit wins:
    markdown image, div block with a caption line after a code fence,
    R chunk with `#| fig-cap:`, and finally any `#| fig-cap:` line.

    Returns:
        Tuple of (old_pattern, new_pattern), the literal text found in the
        content and the text to put in its place, or (None, None) if not found
    """
    escaped_caption = self.escape_caption_for_regex(original_caption)
    escaped_id = re.escape(fig_id)

    # 1. Markdown figure: ![caption](path){#fig-id}
    markdown_hit = re.search(
        rf'!\[{escaped_caption}\](\([^)]+\)\s*\{{[^}}]*#{escaped_id}[^}}]*\}})',
        content
    )
    if markdown_hit:
        return markdown_hit.group(0), f'![{new_caption}]' + markdown_hit.group(1)

    # 2. TikZ figure: caption line between the closing code fence and the closing :::
    tikz_hit = re.search(
        rf'(:::\s*\{{[^}}]*#{escaped_id}[^}}]*\}}.*?```\s*\n\s*){escaped_caption}(\s*)(:::)',
        content,
        re.DOTALL
    )
    if tikz_hit:
        head, gap, closer = tikz_hit.group(1), tikz_hit.group(2), tikz_hit.group(3)
        # Keep the original spacing, but never glue the caption to the closing :::
        if not gap:
            gap = '\n'
        return tikz_hit.group(0), head + new_caption + gap + closer

    # 3. R chunk: the fig-cap value is YAML, so compare and write YAML-safe text
    r_hit = re.search(
        rf'```\{{r[^}}]*\}}[^`]*?#\|\s*label:\s*{escaped_id}[^`]*?#\|\s*fig-cap:\s*([^\n]+)',
        content,
        re.DOTALL | re.MULTILINE
    )
    if r_hit:
        yaml_value = r_hit.group(1).strip()
        if self.extract_caption_from_yaml_value(yaml_value) == original_caption:
            matched_text = r_hit.group(0)
            safe_caption = self.ensure_yaml_safe_caption(new_caption)
            # Swap only the value part; everything before it stays untouched.
            kept_prefix = matched_text[:matched_text.rfind(yaml_value)]
            return matched_text, kept_prefix + safe_caption

    # 4. Fallback for non-R code figures: #| fig-cap: "caption"
    code_hit = re.search(rf'(#\|\s*fig-cap:\s*["\']?){escaped_caption}(["\']?)', content)
    if code_hit:
        return code_hit.group(0), code_hit.group(1) + new_caption + code_hit.group(2)

    return None, None
def build_table_search_patterns(self, tbl_id: str, original_caption: str,
                                new_caption: str, content: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Build simple search patterns for table caption replacement.

    Finds the line holding the caption followed by the `{#tbl-id ...}`
    attribute block, with or without a leading colon, and always produces
    the simple format: `: [caption]. {#tbl-id [attributes]}`

    Args:
        tbl_id: Table ID (e.g., "tbl-data")
        original_caption: Caption text currently in the file
        new_caption: Replacement caption; a trailing period is added when missing
        content: Full text of the QMD file

    Returns:
        Tuple of (old_pattern, new_pattern), the literal text found in the
        content and the text to put in its place, or (None, None) if not found
    """
    escaped_caption = self.escape_caption_for_regex(original_caption)

    # Only spaces/tabs may sit between the optional colon and the caption.
    # With re.MULTILINE a `\s*` here can start matching on the blank line
    # above a colon-less caption and swallow its newline, which would make
    # the replacement delete the blank line separating table and caption.
    pattern = rf'^:?[ \t]*{escaped_caption}(\s*\{{[^}}]*#{re.escape(tbl_id)}[^}}]*\}})(.*)$'
    match = re.search(pattern, content, re.MULTILINE)
    if not match:
        return None, None

    old_pattern = match.group(0)
    attributes = match.group(1)  # The {#tbl-id ...} part
    trailing = match.group(2)  # Anything after the }

    # Ensure new_caption ends with a period but avoid double periods
    if not new_caption.endswith('.'):
        new_caption = new_caption + '.'

    # Always output the simple format: `: new_caption {#tbl-id [attributes]}`
    new_pattern = f': {new_caption} {attributes.strip()}{trailing}'
    return old_pattern, new_pattern
def build_listing_search_patterns(self, lst_id: str, original_caption: str,
                                  new_caption: str, content: str) -> Tuple[str, str]:
    """
    Locate a listing caption in the content and build its replacement text.

    Three forms are tried in order and the first hit wins:
    1. Quoted: #| lst-cap: "caption"
    2. Unquoted: #| lst-cap: caption (up to the end of the line)
    3. Callout div: title="caption" inside the attribute block carrying the listing ID

    Returns:
        Tuple of (old_pattern, new_pattern), the literal text found in the
        content and the text to put in its place, or (None, None) if not found
    """
    escaped_caption = re.escape(original_caption)

    # Each attempt captures the text before (group 1) and after (group 2)
    # the caption, so the replacement keeps everything around it intact.
    attempts = (
        (rf'(#\|\s*lst-cap:\s*"){escaped_caption}(")', 0),
        (rf'(#\|\s*lst-cap:\s*){escaped_caption}(\s*$)', re.MULTILINE),
        (rf'(:::\s*\{{[^}}]*#{re.escape(lst_id)}[^}}]*title\s*=\s*"){escaped_caption}("[^}}]*\}})', 0),
    )
    for pattern, flags in attempts:
        hit = re.search(pattern, content, flags)
        if hit:
            return hit.group(0), hit.group(1) + new_caption + hit.group(2)

    return None, None
def improve_captions_with_llm(self, directories: List[str], content_map: Optional[Dict] = None,
                              figures_only: bool = False, tables_only: bool = False, listings_only: bool = False):
    """Improve captions using LLM and immediately update each file after processing.

    Items are grouped by source file. Each file is read once, every figure,
    table and listing in it is sent to the LLM, and the file is rewritten as
    soon as its items are done.

    Args:
        directories: Directories to scan when ``content_map`` is not supplied.
        content_map: Optional pre-built map with 'figures', 'tables' and
            'listings' sections keyed by item ID.
        figures_only / tables_only / listings_only: Forwarded to the content-map
            builder and used to decide which counts appear in the progress line.

    Returns:
        The content map, with a 'new_caption' entry added to every improved
        item, or {} if no content map could be built.
    """
    # (section key in the content map, singular name, progress icon)
    item_kinds = (
        ('figures', 'figure', '📊'),
        ('tables', 'table', '📋'),
        ('listings', 'listing', '📝'),
    )

    print("🤖 Improving captions with LLM...")

    # Build content map if not provided
    if content_map is None:
        print("📄 Building content map from QMD files...")
        content_map = self.build_content_map_from_qmd(directories, figures_only=figures_only, tables_only=tables_only, listings_only=listings_only)

    if not content_map:
        print("❌ Failed to build content map")
        return {}

    totals = {section: len(content_map.get(section, {})) for section, _, _ in item_kinds}
    if not any(totals.values()):
        print("❌ No figures, tables, or listings found in content map")
        return content_map

    # A section is listed only when no *other* "*_only" flag excludes it.
    only_flags = {'figures': figures_only, 'tables': tables_only, 'listings': listings_only}
    processing_parts = [
        f"{totals[section]} {section}"
        for section, _, _ in item_kinds
        if totals[section] > 0
        and not any(flag for other, flag in only_flags.items() if other != section)
    ]
    if processing_parts:
        print(f"📊 Processing: {', '.join(processing_parts)}")
    else:
        print("📊 No items to process")

    # Group items by source file so each file is read and written once.
    files_to_process = {}
    for section, _, _ in item_kinds:
        for item_id, item_data in content_map.get(section, {}).items():
            source_file = item_data.get('source_file')
            if source_file:
                bucket = files_to_process.setdefault(
                    source_file, {'figures': [], 'tables': [], 'listings': []})
                bucket[section].append((item_id, item_data))

    def find_image_path(file_content, source_file, fig_id):
        """Return the on-disk path of a markdown figure's image, or None."""
        image_pattern = rf'!\[[^\]]*\]\(([^)]+)\)[^{{]*{{[^}}]*#{re.escape(fig_id)}'
        match = re.search(image_pattern, file_content)
        if not match:
            return None
        # Image paths are written relative to the QMD file that contains them.
        image_path = str(Path(source_file).parent / match.group(1))
        return image_path if os.path.exists(image_path) else None

    def find_listing_code(file_content, lst_id):
        """Return the fenced code of a listing, or "" when it cannot be found."""
        lst_def = self.find_listing_definition_in_qmd(file_content, lst_id)
        if lst_def:
            code_match = re.search(r'```[^`]*```', lst_def['full_text'], re.DOTALL)
            if code_match:
                return code_match.group(0)
        return ""

    def improve_item(kind, icon, item_id, item_data, file_content, source_file):
        """Ask the LLM for a better caption; return an improvement record or None."""
        print(f" {icon} Processing {kind}: {item_id}")
        try:
            context = self.extract_section_context(file_content, item_id)

            image_path = None
            llm_kwargs = {'is_table': kind == 'table'}
            if kind == 'figure':
                if item_data.get('type') == 'markdown':
                    image_path = find_image_path(file_content, source_file, item_id)
            elif kind == 'listing':
                llm_kwargs['is_listing'] = True
                llm_kwargs['code_content'] = find_listing_code(file_content, item_id)

            current_caption = item_data.get('original_caption', '')
            new_caption = self.generate_caption_with_ollama(
                context['title'],
                context['content'],
                item_id,
                current_caption,
                image_path,
                **llm_kwargs
            )

            if new_caption and new_caption != current_caption:
                item_data['new_caption'] = new_caption
                word_count = len(new_caption.split())
                print(f" ✅ Improved ({word_count} words): {new_caption[:120]}{'...' if len(new_caption) > 120 else ''}")
                return {
                    'id': item_id,
                    'type': kind,
                    'original': current_caption,
                    'new': new_caption
                }
            print(f" ⚠️ No improvement generated")
        except Exception as e:
            # One failing item must not abort the rest of the file.
            print(f" ❌ Error processing {item_id}: {e}")
        return None

    total_improved = 0
    files_updated = 0

    # Process each file independently
    for source_file, items in files_to_process.items():
        print(f"\n📄 Processing file: {source_file}")
        try:
            # Read file content once for this file
            with open(source_file, 'r', encoding='utf-8') as f:
                file_content = f.read()

            file_improvements = []
            for section, kind, icon in item_kinds:
                for item_id, item_data in items[section]:
                    improvement = improve_item(kind, icon, item_id, item_data, file_content, source_file)
                    if improvement:
                        file_improvements.append(improvement)

            # Immediately update this file if we have improvements
            if file_improvements:
                print(f" ✏️ Updating file with {len(file_improvements)} improvements...")
                self.update_single_file_captions(source_file, file_improvements)
                files_updated += 1
                print(f" ✅ File updated successfully!")
            else:
                print(f" No improvements for this file - skipping update")

            total_improved += len(file_improvements)
        except Exception as e:
            print(f" ❌ Error processing file {source_file}: {e}")

    print(f"\n🎉 LLM improvement complete!")
    print(f" 📊 Total captions improved: {total_improved}")
    print(f" 📁 Files updated: {files_updated}")
    return content_map
def update_single_file_captions(self, file_path: str, improvements: List[Dict]):
    """
    Apply a batch of caption improvements to one QMD file.

    Each improvement is handed to ``apply_targeted_caption_update``; a failure
    or exception for one item is reported and does not stop the others.

    Args:
        file_path: Path to the QMD file to update
        improvements: List of dicts with 'id', 'type', 'original', 'new' keys
    """
    if not improvements:
        return

    applied_total = 0
    for entry in improvements:
        item_id, item_type = entry['id'], entry['type']
        original_caption, new_caption = entry['original'], entry['new']

        try:
            applied = self.apply_targeted_caption_update(
                file_path, item_id, item_type, original_caption, new_caption
            )
            if not applied:
                print(f" ⚠️ Failed to update {item_id}")
                continue
            applied_total += 1
            print(f" ✅ Updated {item_id}")
        except Exception as e:
            print(f" ❌ Error updating {item_id}: {e}")

    print(f" 📊 Successfully updated {applied_total}/{len(improvements)} items in {file_path}")
def complete_caption_improvement_workflow(self, directories: List[str], save_json: bool = False, figures_only: bool = False, tables_only: bool = False, listings_only: bool = False):
    """
    Complete LLM caption improvement process (used by --improve and default mode).

    Process: Extract → Analyze → Improve with LLM → Update files → Validate

    Args:
        directories: List of directories to process
        save_json: Whether to save detailed JSON output
        figures_only: If True, only process figures (ignore tables and listings)
        tables_only: If True, only process tables (ignore figures and listings)
        listings_only: If True, only process listings (ignore figures and tables)

    Returns:
        The content map after improvement, or {} when the run is halted by
        commented-chapter issues or no content map could be built.
    """
    # Listings are processed by the LLM step too, so they are counted here as well.
    sections = ('figures', 'tables', 'listings')

    print("🚀 Starting complete caption improvement workflow...")

    # Check for commented chapters first
    commented_issues = self.check_commented_chapters_in_directories(directories)
    should_halt = self.print_commented_chapter_issues(commented_issues)
    if should_halt:
        return {}

    # Step 1: Build content map with filtering
    print("📄 Step 1: Building content map...")
    content_map = self.build_content_map_from_qmd(directories, figures_only=figures_only, tables_only=tables_only, listings_only=listings_only)
    if not content_map:
        print("❌ Failed to build content map")
        return {}

    total_items = sum(len(content_map.get(section, {})) for section in sections)
    print(f"✅ Found {total_items} items to process")

    # Optional: Save JSON for inspection
    if save_json:
        self.save_content_map(content_map)
        print("💾 Content map saved to content_map.json for inspection")

    # Step 2: Improve captions using LLM (files updated immediately during processing)
    print("\n🤖 Step 2: Improving captions with LLM...")
    improved_content_map = self.improve_captions_with_llm(directories, content_map, figures_only=figures_only, tables_only=tables_only, listings_only=listings_only)

    # Count every item, in any section, that received a new caption.
    improved_count = sum(
        1
        for section in sections
        for item_data in improved_content_map.get(section, {}).values()
        if item_data.get('new_caption')
    )

    if improved_count == 0:
        print("⚠️ No captions were improved. Workflow complete.")
        return improved_content_map

    print(f"{improved_count} captions improved and files updated")

    # Step 3: Save improvements summary to JSON file
    print("\n💾 Step 3: Saving improvements summary...")
    improvements_file = self.save_improvements_summary(improved_content_map, directories, improved_count, total_items)

    print("\n🎉 LLM caption improvement completed successfully!")
    print(f"📊 Total items processed: {total_items}")
    print(f"📝 Items improved: {improved_count}")
    print(f"📁 Directories: {', '.join(directories)}")
    print(f"📄 Improvements saved to: {improvements_file}")
    return improved_content_map
def save_improvements_summary(self, content_map: Dict, directories: List[str], improved_count: int, total_items: int) -> str:
    """
    Save a comprehensive summary of caption improvements to a JSON file.

    The file is written to the current working directory as
    ``caption_improvements_<YYYYmmdd_HHMMSS>.json`` and covers figures,
    tables and listings.

    Args:
        content_map: Content map with original and improved captions
        directories: Directories processed
        improved_count: Number of items improved
        total_items: Total items processed

    Returns:
        Path to the saved improvements file, or "" if saving failed
    """
    # One timestamp for both the metadata and the filename so they always agree.
    now = datetime.now()

    sections = ('figures', 'tables', 'listings')
    # Figures carry their own 'type' in the content map; the other sections are fixed.
    fixed_types = {'tables': 'table', 'listings': 'listing'}

    improvements = {
        'metadata': {
            'timestamp': now.isoformat(),
            'directories_processed': directories,
            'total_items': total_items,
            'items_improved': improved_count,
            'success_rate': f"{(improved_count/total_items*100):.1f}%" if total_items > 0 else "0%",
            'workflow': 'complete_caption_improvement_workflow',
            'method': 'pypandoc_ast_context_extraction'
        },
        'improvements': {section: {} for section in sections},
        'summary': {
            'figures_improved': 0,
            'tables_improved': 0,
            'listings_improved': 0,
            'no_change': 0
        }
    }

    for section in sections:
        for item_id, item_data in content_map.get(section, {}).items():
            original = item_data.get('original_caption', '')
            improved = item_data.get('new_caption', '')
            was_improved = bool(improved) and improved != original

            improvements['improvements'][section][item_id] = {
                'id': item_id,
                'type': fixed_types.get(section) or item_data.get('type', 'unknown'),
                'source_file': item_data.get('source_file', ''),
                'original_caption': original,
                'improved_caption': improved,
                'status': 'improved' if was_improved else 'no_change'
            }

            counter = f'{section}_improved' if was_improved else 'no_change'
            improvements['summary'][counter] += 1

    # Generate filename with timestamp
    filename = f"caption_improvements_{now.strftime('%Y%m%d_%H%M%S')}.json"

    try:
        # Convert any Path objects to strings for JSON serialization
        serializable_improvements = convert_paths_to_strings(improvements)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(serializable_improvements, f, indent=2, ensure_ascii=False)
        summary = improvements['summary']
        print(f"📄 Improvements summary saved to: {filename}")
        print(f"📊 Summary: {summary['figures_improved']} figures + {summary['tables_improved']} tables + {summary['listings_improved']} listings improved")
        return filename
    except Exception as e:
        print(f"❌ Error saving improvements summary: {e}")
        return ""
def extract_references_with_pypandoc(self, content: str, qmd_file_path: str) -> Dict[str, Dict]:
    """
    Use pypandoc to systematically extract figure/table references and their context.

    This approach:
    1. Parses QMD to structured AST using pypandoc
    2. Finds all @fig-id and @tbl-id references in paragraphs
    3. Extracts surrounding paragraph context systematically
    4. Maps references to their #fig-id definitions

    Only the first paragraph that mentions an ID is recorded for that ID.

    Args:
        content: Full text of the QMD file.
        qmd_file_path: Stored verbatim under 'file_path' in every entry.

    Returns:
        Dict mapping reference ID -> {
            'reference_paragraph': text of the paragraph containing @id,
            'section_title': text of the most recent level-2 header, or
                "Unknown Section" when none precedes the paragraph,
            'definition_info': result of the figure/table definition lookup,
            'file_path': qmd_file_path,
            'context_paragraphs': [previous, current, next] paragraph texts
        }
        Returns {} if parsing fails for any reason.
    """
    try:
        # NOTE(review): `pypandoc` must be importable at module level; if it is
        # missing, the NameError is swallowed by the handler below — confirm.
        ast_json = pypandoc.convert_text(
            content,
            'json',
            format='markdown+smart',
            extra_args=['--preserve-tabs']
        )
        ast = json.loads(ast_json)
        references_map = {}

        # Headers and paragraphs are siblings in the block list, so the current
        # section has to live outside the recursive call frames to carry over
        # from a header to the paragraphs that follow it.
        current_section = "Unknown Section"

        def walk_ast(element):
            nonlocal current_section

            if isinstance(element, list):
                for item in element:
                    walk_ast(item)
                return
            if not isinstance(element, dict):
                return

            element_type = element.get('t', '')

            if element_type == 'Header':
                # Header payload is [level, attr, inlines]
                header = element.get('c', [None, None, []])
                if header[0] == 2:  # ## headers
                    current_section = self._extract_text_from_inlines(header[2])

            elif element_type == 'Para':
                para_text = self._extract_text_from_inlines(element.get('c', []))

                # Find figure/table references in this paragraph
                fig_refs = re.findall(r'@(fig-[a-zA-Z0-9_-]+)', para_text)
                tbl_refs = re.findall(r'@(tbl-[a-zA-Z0-9_-]+)', para_text)

                for ref_id in fig_refs + tbl_refs:
                    if ref_id in references_map:
                        continue
                    def_info = self.find_figure_definition_in_qmd(content, ref_id) or \
                        self.find_table_definition_in_qmd(content, ref_id)
                    references_map[ref_id] = {
                        'reference_paragraph': para_text,
                        'section_title': current_section,
                        'definition_info': def_info,
                        'file_path': qmd_file_path
                    }

            # Recursively process content
            if 'c' in element:
                walk_ast(element['c'])

        # Start walking from the document blocks
        walk_ast(ast.get('blocks', []))

        # Now get adjacent paragraph context for each reference
        for ref_data in references_map.values():
            ref_data['context_paragraphs'] = self._get_adjacent_paragraphs_from_ast(
                ast, ref_data['reference_paragraph'])

        return references_map

    except Exception as e:
        print(f"⚠️ pypandoc parsing failed: {e}")
        print(f" Falling back to regex-based approach")
        return {}
def _extract_text_from_inlines(self, inlines: List) -> str:
    """Extract plain text from pypandoc inline elements.

    Handles the payload layouts of the pandoc JSON AST explicitly so that text
    nested inside citations (``@fig-id``), quotes, spans, links and images is
    kept, while attribute lists and link targets are not mistaken for text.
    Soft and hard line breaks become a single space so wrapped words stay
    separated.

    Args:
        inlines: List of inline elements (dicts) from a pandoc JSON AST.

    Returns:
        The concatenated plain text.
    """
    # Payload is [metadata, inlines]: quote type / citations / attr come first.
    wrapped_inline_types = ('Quoted', 'Cite', 'Span')
    # Payload is [metadata, text]: attr / math type comes first.
    literal_text_types = ('Code', 'Math')
    whitespace_types = ('Space', 'SoftBreak', 'LineBreak')

    def join(items):
        return ''.join(extract_from_element(item) for item in items)

    def extract_from_element(element):
        if isinstance(element, str):
            return element
        if not isinstance(element, dict):
            # Bare lists (attrs, targets) and numbers carry no prose.
            return ''

        element_type = element.get('t', '')
        payload = element.get('c')

        if element_type == 'Str':
            return element.get('c', '')
        if element_type in whitespace_types:
            return ' '

        if element_type in wrapped_inline_types or element_type in literal_text_types:
            if isinstance(payload, list) and len(payload) == 2:
                body = payload[1]
                if isinstance(body, str):
                    return body
                if isinstance(body, list):
                    return join(body)

        if element_type in ('Link', 'Image'):
            if isinstance(payload, list) and payload:
                # Current pandoc: [attr, inlines, target]; legacy: [inlines, target].
                label = payload[1] if len(payload) == 3 else payload[0]
                return join(label) if isinstance(label, list) else ''
            return ''

        # Emph, Strong and any other inline whose payload is directly a list of inlines.
        if 'c' in element:
            if isinstance(payload, list):
                return join(payload)
            return str(payload)
        return ''

    return join(inlines)
def _get_adjacent_paragraphs_from_ast(self, ast: Dict, target_paragraph: str) -> List[str]:
    """
    Find the target paragraph in AST and return [previous, current, next] paragraphs.

    Paragraphs are collected in document order, including those nested inside
    containers such as divs, block quotes and lists. An exact text match is
    preferred; a substring match between non-empty texts is the fallback.

    Args:
        ast: Parsed pandoc JSON document (dict with a 'blocks' list).
        target_paragraph: Plain text of the paragraph to locate.

    Returns:
        Up to three paragraph texts centred on the match (neighbours are
        omitted at the document edges), or [target_paragraph] if not found.
    """
    paragraphs = []

    def collect_paragraphs(node):
        if isinstance(node, list):
            # Container payloads nest their blocks inside plain lists.
            for child in node:
                collect_paragraphs(child)
        elif isinstance(node, dict):
            if node.get('t') == 'Para':
                paragraphs.append(self._extract_text_from_inlines(node.get('c', [])))
            elif 'c' in node:
                collect_paragraphs(node['c'])

    collect_paragraphs(ast.get('blocks', []))

    target_idx = None
    if target_paragraph in paragraphs:
        target_idx = paragraphs.index(target_paragraph)
    elif target_paragraph:
        for i, para in enumerate(paragraphs):
            # An empty paragraph (e.g. image-only) is a substring of everything,
            # so it must never count as a match.
            if para and (target_paragraph in para or para in target_paragraph):
                target_idx = i
                break

    if target_idx is None:
        return [target_paragraph]  # Fallback

    first = max(target_idx - 1, 0)
    return paragraphs[first:target_idx + 2]
def check_ollama_and_model(self, model_name: str) -> bool:
    """
    Check if Ollama is running and if the specified model is available.

    The model is never pulled automatically; when it is missing, the command
    to pull it is printed together with a short list of installed models.
    A name without a tag is also accepted when ``<name>:latest`` is installed,
    matching Ollama's default-tag rule.

    Args:
        model_name: Model name as passed to Ollama (e.g. "qwen2.5:7b").

    Returns:
        True if model is ready to use, False if there are issues
    """
    print(f"🔍 Checking Ollama and model: {model_name}")
    try:
        # Check if Ollama is running
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
        if response.status_code != 200:
            print("❌ Ollama server not responding. Please start Ollama:")
            print(" brew services start ollama")
            print(" # or")
            print(" ollama serve")
            return False

        # Get list of available models
        models_data = response.json()
        available_models = [
            model['name'] for model in models_data.get('models', []) if model.get('name')
        ]
        print(f"📦 Available models: {len(available_models)} found")

        # Ollama resolves an untagged name to the ":latest" tag.
        accepted_names = {model_name}
        if ':' not in model_name:
            accepted_names.add(f"{model_name}:latest")

        if accepted_names.intersection(available_models):
            print(f"✅ Model {model_name} is ready!")
            return True

        # Model not found - provide instructions instead of auto-pulling
        print(f"❌ Model {model_name} not found locally.")
        print("💡 To use this model, please run:")
        print(f" ollama pull {model_name}")
        print()
        print("📋 Available models:")
        for model in available_models[:5]:  # Show first 5
            print(f"{model}")
        if len(available_models) > 5:
            print(f" ... and {len(available_models) - 5} more")
        print()
        print("🔧 To see all models: ollama list")
        return False

    except requests.exceptions.ConnectionError:
        print("❌ Cannot connect to Ollama. Please ensure Ollama is installed and running:")
        print(" 1. Install: curl -fsSL https://ollama.ai/install.sh | sh")
        print(" 2. Start: ollama serve")
        print(f" 3. Pull model: ollama pull {model_name}")
        return False
    except requests.exceptions.Timeout:
        # The only request made here is the 5-second model-list query.
        print("⏰ Timed out waiting for the Ollama server to respond.")
        print("💡 Check that Ollama is running and responsive:")
        print(" ollama serve")
        return False
    except Exception as e:
        print(f"❌ Unexpected error checking Ollama: {e}")
        return False
def list_available_models(self) -> bool:
    """
    List all available Ollama models.

    Queries the local Ollama server and prints each model's name, size and
    modification date, smallest model first. When no models are installed, a
    few suggested ``ollama pull`` commands are printed instead.

    Returns:
        True if successful, False if there are issues
    """
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
        if response.status_code != 200:
            print("❌ Ollama server not responding. Please start Ollama.")
            return False

        models_data = response.json()
        models = models_data.get('models', [])
        if not models:
            print("📦 No models found. Popular models to try:")
            print(" ollama pull qwen2.5:7b # Fast, good quality")
            print(" ollama pull llama3.2:3b # Smaller, faster")
            print(" ollama pull qwen2.5:14b # Larger, better quality")
            print(" ollama pull mistral:7b # Alternative option")
            return True

        print(f"📦 Available Ollama Models ({len(models)} found):")
        print("=" * 60)

        # Sort models by size for better display
        sorted_models = sorted(models, key=lambda x: x.get('size', 0))
        for model in sorted_models:
            name = model.get('name', 'Unknown')
            size = model.get('size', 0)
            size_gb = size / (1024**3) if size > 0 else 0
            modified = model.get('modified_at', '')

            # Format size nicely
            if size_gb >= 1:
                size_str = f"{size_gb:.1f}GB"
            else:
                size_mb = size / (1024**2)
                size_str = f"{size_mb:.0f}MB"

            # Format date
            if modified:
                try:
                    dt = datetime.fromisoformat(modified.replace('Z', '+00:00'))
                    date_str = dt.strftime('%Y-%m-%d')
                except ValueError:
                    # Timestamp in a form fromisoformat rejects: fall back to
                    # the leading YYYY-MM-DD characters.
                    date_str = modified[:10]
            else:
                date_str = "Unknown"

            print(f" 📊 {name:<25}{size_str:>8}{date_str}")

        print("=" * 60)
        print("💡 Usage: python improve_figure_captions.py -d contents/vol1/ --model MODEL_NAME")
        return True

    except requests.exceptions.ConnectionError:
        print("❌ Cannot connect to Ollama. Please ensure Ollama is running.")
        return False
    except Exception as e:
        print(f"❌ Error listing models: {e}")
        return False
def extract_section_context(self, content: str, figure_or_table_id: str) -> Dict[str, str]:
    """Return the section title and surrounding text for a figure/table/listing ID.

    The pypandoc AST lookup is tried first. When it has no entry for the ID,
    or raises, the paragraph-based extractor supplies the result instead.

    Returns:
        Dict with 'title' and 'content' keys.
    """
    try:
        references = self.extract_references_with_pypandoc(content, "temp_file")
        if figure_or_table_id in references:
            ref_data = references[figure_or_table_id]
            paragraphs = ref_data.get('context_paragraphs', [])
            section = ref_data.get('section_title', 'Unknown Section')
            return {'title': section, 'content': '\n\n'.join(paragraphs)}
    except Exception as e:
        print(f" ⚠️ pypandoc approach failed: {e}")

    # Nothing usable from the AST: use the paragraph-based extractor.
    return self.extract_paragraph_context(content, figure_or_table_id)
def detect_code_listing(self, content, lst_id):
    """
    Detect code listings with lst-label and lst-cap OR HTML callout with title.

    Handles two patterns:
    1. Traditional: #| lst-label: lst-id + #| lst-cap: Caption
    2. HTML callout: ::: {#lst-id .callout-important title="Caption"}

    The ID is matched as a whole token, so searching for ``lst-demo`` never
    selects a listing labelled ``lst-demo-2``.

    Args:
        content (str): File content
        lst_id (str): Listing ID to search for (e.g., 'lst-mlp_layer_matrix')

    Returns:
        dict: Listing definition with caption, full_text, type='listing',
        language and pattern ('traditional' or 'callout'), or None if the
        listing is not found
    """
    # Whole-token ID: must not be followed by another identifier character.
    id_pattern = re.escape(lst_id) + r'(?![\w-])'

    # Method 1: Traditional pattern with #| lst-label: and #| lst-cap:
    label_pattern = rf'#\|\s*lst-label:\s*{id_pattern}'
    code_block_pattern = r'```\{([^}]+)\}(.*?)```'
    # - Quoted:   #| lst-cap: "Caption text"  (trailing spaces/tabs tolerated)
    # - Unquoted: #| lst-cap: Caption text
    cap_pattern = r'#\|\s*lst-cap:\s*(?:"([^"]*)"[ \t]*|(.+))$'

    for block_match in re.finditer(code_block_pattern, content, re.DOTALL):
        code_block = block_match.group(0)

        # Check if this block contains our listing label
        if not re.search(label_pattern, code_block):
            continue

        cap_match = re.search(cap_pattern, code_block, re.MULTILINE)
        if cap_match:
            # Group 1: quoted caption, Group 2: unquoted caption
            caption = (cap_match.group(1) or cap_match.group(2) or '').strip()
        else:
            caption = ""

        return {
            'caption': caption,
            'full_text': code_block,
            'type': 'listing',
            'language': block_match.group(1).strip(),
            'pattern': 'traditional'
        }

    # Method 2: HTML callout pattern with title="Caption"
    # Pattern: ::: {#lst-id .callout-important title="Caption"} ... code block ... :::
    callout_pattern = rf':::\s*\{{[^}}]*#{id_pattern}[^}}]*title\s*=\s*"([^"]*)"[^}}]*\}}(.*?):::'
    callout_match = re.search(callout_pattern, content, re.DOTALL)
    if callout_match:
        # Extract language from the code block within the callout
        code_in_callout = re.search(r'```\{\.?([^}]+)\}', callout_match.group(2))
        return {
            'caption': callout_match.group(1).strip(),
            'full_text': callout_match.group(0),
            'type': 'listing',
            'language': code_in_callout.group(1).strip() if code_in_callout else 'unknown',
            'pattern': 'callout'
        }

    return None
def find_listing_definition_in_qmd(self, content, lst_id):
    """
    Locate the definition of a listing inside QMD text.

    Detectors are tried in order and the first truthy result is returned.
    At present the only detector is ``detect_code_listing``.

    Args:
        content (str): QMD file content
        lst_id (str): Listing ID to search for (e.g., 'lst-mlp_layer_matrix')

    Returns:
        dict: Listing definition with type, caption, full_text, language,
        or None when no detector finds the listing
    """
    detectors = (self.detect_code_listing,)
    # Lazy generator: a detector only runs if the ones before it found nothing.
    attempts = (detect(content, lst_id) for detect in detectors)
    return next((found for found in attempts if found), None)
# ================================================================
# GENERIC MULTI-INSTANCE DETECTION SYSTEM
# ================================================================
def detect_all_instances_by_id(self, content: str, item_id: str, item_type: str) -> List[Dict]:
    """
    Generic method to find ALL instances of any ID (fig-, tbl-, lst-) in content.

    This handles the common scenario where the same ID appears multiple times
    due to HTML/PDF conditional rendering (e.g., content-visible blocks).
    Only listings are implemented; figures and tables return an empty list.

    Args:
        content: QMD file content
        item_id: Full ID (e.g., "fig-example", "tbl-data", "lst-code")
        item_type: Type hint ("figure", "table", "listing")

    Returns:
        List of instances. Each has 'format' ('html_callout' or 'pdf_code'),
        'caption', 'full_text', 'start', 'end' (offsets into ``content``) and
        'pattern_type'; code-block instances also carry 'language'. All
        callout instances come first, then code-block instances.
    """
    instances = []

    if item_type == "listing":
        # Whole-token ID so "lst-demo" does not match "lst-demo-2".
        id_pattern = re.escape(item_id) + r'(?![\w-])'

        # Pattern 1: HTML callout format
        # ::: {#lst-id .callout-important title="Caption"}
        # The closing quote must be the same character as the opening one, so
        # an apostrophe inside a double-quoted title does not end the caption.
        callout_pattern = rf':::\s*\{{[^}}]*#{id_pattern}[^}}]*?title\s*=\s*(["\'])(.*?)\1[^}}]*\}}'
        for match in re.finditer(callout_pattern, content):
            instances.append({
                'format': 'html_callout',
                'caption': match.group(2).strip(),
                'full_text': match.group(0),
                'start': match.start(),
                'end': match.end(),
                'pattern_type': 'callout'
            })

        # Pattern 2: PDF code block format
        # #| lst-label: lst-id
        # #| lst-cap: Caption (or "Caption")
        label_pattern = rf'#\|\s*lst-label:\s*{id_pattern}'
        # A quoted caption must fill the rest of the line; otherwise the whole
        # remainder of the line is taken as an unquoted caption.
        cap_pattern = r'#\|\s*lst-cap:[ \t]*(?:"([^"\n]*)"|\'([^\'\n]*)\'|([^\n]*?))[ \t]*$'

        # Find all code blocks that contain this label
        for code_match in re.finditer(r'```\{([^}]+)\}(.*?)```', content, re.DOTALL):
            code_content = code_match.group(2)
            if not re.search(label_pattern, code_content):
                continue
            cap_match = re.search(cap_pattern, code_content, re.MULTILINE)
            if not cap_match:
                continue
            caption = next((g for g in cap_match.groups() if g is not None), '').strip()
            instances.append({
                'format': 'pdf_code',
                'caption': caption,
                'full_text': code_match.group(0),
                'start': code_match.start(),
                'end': code_match.end(),
                'pattern_type': 'code_block',
                'language': code_match.group(1).strip()
            })

    elif item_type == "figure":
        # TODO: Add figure detection patterns (markdown, tikz, code)
        # For now, fall back to existing detection
        pass

    elif item_type == "table":
        # TODO: Add table detection patterns
        # For now, fall back to existing detection
        pass

    return instances
def update_all_instances_atomically(self, content: str, item_id: str,
                                    instances: List[Dict], new_caption: str) -> Tuple[str, bool, List[str]]:
    """
    Update ALL instances of an ID atomically - success/fail as a unit.

    Every instance is rewritten inside its own [start, end) span, so several
    instances of the same ID and format are each updated. The caption is
    inserted literally: digits and backslashes in it are never interpreted as
    regex replacement escapes.

    Args:
        content: Original file content
        item_id: ID to update (used in error messages)
        instances: List of instances from detect_all_instances_by_id
        new_caption: New caption text

    Returns:
        Tuple of (updated_content, success, error_messages). On any failure
        the original ``content`` is returned unchanged.
    """
    if not instances:
        return content, False, ["No instances to update"]

    # title="old" / title='old' -> same quote style, new caption
    title_re = re.compile(r'(title\s*=\s*)(["\'])(.*?)\2')
    # #| lst-cap: old  (quoted or not) -> #| lst-cap: "new"
    caption_re = re.compile(r'(#\|\s*lst-cap:[ \t]*)(?:"[^"\n]*"|\'[^\'\n]*\'|[^\n]*)')

    def replace_title(match):
        return f"{match.group(1)}{match.group(2)}{new_caption}{match.group(2)}"

    def replace_lst_cap(match):
        return f'{match.group(1)}"{new_caption}"'

    # format -> (pattern, replacement function, label used in the failure message)
    handlers = {
        'html_callout': (title_re, replace_title, 'HTML callout'),
        'pdf_code': (caption_re, replace_lst_cap, 'PDF code block'),
    }

    updated_content = content

    # Process instances in reverse order (by position) so that earlier spans
    # keep their offsets while later ones change length.
    for instance in sorted(instances, key=lambda x: x['start'], reverse=True):
        try:
            handler = handlers.get(instance['format'])
            if handler is None:
                # Unknown formats are skipped.
                continue
            regex, replacer, label = handler

            start_pos = instance['start']
            end_pos = instance['end']
            segment = updated_content[start_pos:end_pos]
            new_segment, replaced = regex.subn(replacer, segment, count=1)
            if not replaced:
                return content, False, [f"Failed to update {label} for {item_id}"]

            updated_content = updated_content[:start_pos] + new_segment + updated_content[end_pos:]
        except Exception as e:
            return content, False, [f"Error updating {instance.get('format')} instance: {e}"]

    return updated_content, True, []
# ================================================================
# ENHANCED LISTING DETECTION USING GENERIC SYSTEM
# ================================================================
def find_listing_definition_enhanced(self, content: str, lst_id: str) -> Optional[Dict[str, str]]:
    """
    Multi-instance aware lookup of a listing definition.

    All occurrences of the ID are collected through the generic detector and
    a canonical caption is chosen among them. If the generic detector finds
    nothing, the single-instance lookup is used instead.

    Args:
        content: QMD file content
        lst_id: Listing ID to search for (e.g., 'lst-mlp_layer_matrix')

    Returns:
        Dict with canonical caption and instance information
    """
    instances = self.detect_all_instances_by_id(content, lst_id, "listing")
    if not instances:
        # Nothing from the generic system: defer to the original method.
        return self.find_listing_definition_in_qmd(content, lst_id)

    canonical_caption, warnings = self.canonicalize_caption(instances)
    if warnings:
        print(f" ⚠️ {lst_id}: {'; '.join(warnings)}")

    # The first code-block instance that records a language decides it.
    language = next(
        (inst['language'] for inst in instances
         if inst['format'] == 'pdf_code' and 'language' in inst),
        'unknown',
    )

    return {
        'type': 'listing',
        'caption': canonical_caption,
        'language': language,
        'instances': instances,  # kept for later atomic updates
        'consistency_warnings': warnings,
        'full_text': instances[0]['full_text'],
    }
def clean_caption_artifacts(self, caption: str) -> str:
    """
    Clean unwanted artifacts from captions.

    Removes, case-insensitively:
    - *Source: ...* attributions
    - "via this code snippet" (optionally followed by a period)
    - "through the code" (optionally followed by a period)

    The filler phrases are matched as whole words only, so ordinary text such
    as "through the codebase" or "breakthrough the code" is left intact.
    Afterwards runs of whitespace are collapsed to single spaces and a
    leading and/or trailing period is stripped.

    Args:
        caption: Raw caption text

    Returns:
        Cleaned caption text. A falsy caption (None or "") is returned as-is.
    """
    if not caption:
        return caption

    artifact_patterns = (
        r'\*\s*Source:\s*[^*]*\*',
        # \b on both sides keeps the phrase from matching inside longer words.
        r'\bvia\s+this\s+code\s+snippet\b\.?',
        r'\bthrough\s+the\s+code\b\.?',
    )
    for pattern in artifact_patterns:
        caption = re.sub(pattern, '', caption, flags=re.IGNORECASE)

    # Removing a phrase can leave doubled spaces behind.
    caption = re.sub(r'\s+', ' ', caption).strip()

    # Drop a leading/trailing period that may be left over after removal.
    caption = re.sub(r'^\.\s*|\s*\.$', '', caption).strip()

    return caption
def canonicalize_caption(self, instances: List[Dict]) -> Tuple[str, List[str]]:
    """
    Choose canonical caption from multiple instances and validate consistency.

    Every caption is passed through clean_caption_artifacts before it is
    compared. The first non-empty HTML callout caption wins; otherwise the
    first non-empty caption of any format is used. The input dicts are not
    modified.

    Args:
        instances: List of instances from detect_all_instances_by_id; each
            one is read for its 'caption' and 'format' keys.

    Returns:
        Tuple of (canonical_caption, warnings_if_inconsistent). The caption is
        "" when there are no instances or no non-empty cleaned caption.
    """
    if not instances:
        return "", ["No instances found"]

    if len(instances) == 1:
        return self.clean_caption_artifacts(instances[0]['caption']), []

    # Clean artifacts from all captions before comparison (on copies).
    cleaned_instances = [
        {**instance, 'cleaned_caption': self.clean_caption_artifacts(instance['caption'])}
        for instance in instances
    ]
    captions = [
        inst['cleaned_caption'] for inst in cleaned_instances if inst['cleaned_caption']
    ]

    # dict.fromkeys de-duplicates in first-seen order; a set would make the
    # order inside the warning text vary from run to run.
    unique_captions = list(dict.fromkeys(captions))
    inconsistent = len(unique_captions) > 1

    warnings = []
    if inconsistent:
        warnings.append(f"Inconsistent captions found: {unique_captions}")

    # Preference logic: HTML callout > PDF code block (using cleaned captions)
    for instance in cleaned_instances:
        if instance['format'] == 'html_callout' and instance['cleaned_caption']:
            canonical = instance['cleaned_caption']
            if inconsistent:
                warnings.append(f"Using HTML callout caption: '{canonical}'")
            return canonical, warnings

    # Fallback: `captions` only holds non-empty entries, so take the first.
    if captions:
        first_caption = captions[0]
        if inconsistent:
            warnings.append(f"Using first available caption: '{first_caption}'")
        return first_caption, warnings

    return "", ["No valid captions found"]
def main() -> int:
    """
    Command-line entry point: parse arguments and run the selected mode.

    Modes are mutually exclusive: --build-map, --analyze, --repair and
    --improve. With no mode flag the LLM improvement workflow runs, exactly
    as with --improve. --list-models is handled before any input validation.

    Returns:
        0 on success; 1 when no input was given, an input file is missing or
        is not a .qmd file, Ollama or the model is unavailable, the selected
        mode fails, the user interrupts, or an unexpected exception is raised.
    """
    parser = argparse.ArgumentParser(
        description="Improve figure and table captions using local Ollama models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # argparse expands %(prog)s to the invoked script name, so the
        # examples stay correct when the file is renamed.
        epilog="""
Examples:
# Improve captions with LLM (recommended default):
python %(prog)s -d contents/vol1/
python %(prog)s --improve -d contents/vol1/
# Using different models:
python %(prog)s -d contents/vol1/ --model llama3.2:3b
python %(prog)s -i -d contents/vol1/ -m qwen2.5:14b
python %(prog)s -d contents/vol1/ --model mistral:7b
# Content filtering:
python %(prog)s -d contents/vol1/ --figures-only
python %(prog)s -d contents/vol1/ -F # Short form
python %(prog)s -d contents/vol1/ --tables-only
python %(prog)s -d contents/vol1/ -T # Short form
python %(prog)s -d contents/vol1/ --listings-only
python %(prog)s -d contents/vol1/ -L # Short form
# Analysis and utilities:
python %(prog)s --build-map -d contents/vol1/
python %(prog)s -b -d contents/vol1/
python %(prog)s --analyze -d contents/vol1/
python %(prog)s --repair -d contents/vol1/
# Content filtering with other modes:
python %(prog)s --analyze -d contents/vol1/ --figures-only
python %(prog)s --repair -d contents/vol1/ --tables-only
python %(prog)s --build-map -d contents/vol1/ --listings-only
python %(prog)s --build-map -d contents/vol1/ -F
# Multiple directories:
python %(prog)s -d contents/vol1/ -d contents/frontmatter/ -m llama3.2:3b
# Save detailed JSON output:
python %(prog)s -d contents/vol1/ --save-json
"""
    )

    # Multiple file/directory input
    parser.add_argument('-f', '--files', action='append',
                        help='Process specific QMD files (can be used multiple times)')
    parser.add_argument('-d', '--directories', action='append',
                        help='Process directories recursively for .qmd files (can be used multiple times)')

    # Mode selection
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--build-map', '-b', action='store_true',
                       help='Build content map from QMD files and save to JSON')
    group.add_argument('--analyze', '-a', action='store_true',
                       help='Analyze caption quality and validate file structure')
    group.add_argument('--repair', '-r', action='store_true',
                       help='Repair caption formatting issues only')
    group.add_argument('--improve', '-i', action='store_true',
                       help='Improve captions using LLM and update files (default mode)')

    # Content type filtering
    content_group = parser.add_mutually_exclusive_group()
    content_group.add_argument('--figures-only', '-F', action='store_true',
                               help='Process only figures (ignore tables and listings)')
    content_group.add_argument('--tables-only', '-T', action='store_true',
                               help='Process only tables (ignore figures and listings)')
    content_group.add_argument('--listings-only', '-L', action='store_true',
                               help='Process only code listings (ignore figures and tables)')

    # Model options
    parser.add_argument('--model', '-m', default='qwen2.5:7b',
                        help='Ollama model to use (default: qwen2.5:7b)')
    parser.add_argument('--list-models', action='store_true',
                        help='List available Ollama models and exit')

    # Output options
    parser.add_argument('--save-json', action='store_true',
                        help='Save detailed content map to JSON file')

    args = parser.parse_args()

    # Handle --list-models flag
    if args.list_models:
        improver = FigureCaptionImprover()
        return 0 if improver.list_available_models() else 1

    # Validate that we have input files/directories for other operations
    if not args.files and not args.directories:
        print("❌ Error: --files or --directories required")
        return 1

    directories = list(args.directories or [])
    specific_files = list(args.files or [])

    # Every explicitly named file must exist and be a QMD file.
    for file in specific_files:
        if not Path(file).exists():
            print(f"❌ File not found: {file}")
            return 1
        if not file.endswith('.qmd'):
            print(f"❌ Not a QMD file: {file}")
            return 1

    # Initialize improver with specified model
    improver = FigureCaptionImprover(model_name=args.model)

    # --repair only fixes formatting and is documented as using no LLM, so it
    # must not be blocked by a missing Ollama install or model.
    # NOTE(review): --build-map and --analyze may not need the model either;
    # confirm against their implementations before relaxing the check further.
    if not args.repair and not improver.check_ollama_and_model(args.model):
        print(f"❌ Cannot proceed without properly configured Ollama and model {args.model}")
        return 1

    # The same content-type filters are forwarded to every mode.
    content_filters = {
        'figures_only': args.figures_only,
        'tables_only': args.tables_only,
        'listings_only': args.listings_only,
    }

    try:
        if args.build_map:
            # Build content map and save to JSON
            print("🔍 Building content map from QMD files...")
            content_map = improver.build_content_map_from_qmd(
                directories, specific_files=specific_files, **content_filters)
            if not content_map:
                print("❌ Content map building failed!")
                return 1

            print("✅ Content map building completed!")
            # Always save JSON for --build-map
            improver.save_content_map(content_map)

            # Show extraction report
            stats = content_map['metadata']['extraction_stats']
            if stats['extraction_failures'] == 0:
                print("🎉 Perfect extraction! All figures and tables successfully processed.")
            else:
                print(f"⚠️ {stats['extraction_failures']} extraction failures detected.")
                print("💡 Consider reviewing the files with issues for manual fixes.")

            # Show brief summary
            print("\n📋 CONTENT SUMMARY:")
            print(f" 📊 Figures: {stats['figures_found']} total")
            print(f" • Markdown: {stats['markdown_figures']}")
            print(f" • TikZ: {stats['tikz_figures']}")
            print(f" • R: {stats['r_figures']}")
            print(f" • Code: {stats['code_figures']}")
            print(f" 📋 Tables: {stats['tables_found']} total")
            print(f" 📁 Files processed: {content_map['metadata']['qmd_files_scanned']}")
            print("\n💾 Content map saved to: content_map.json")
            print("📄 You can now review the complete JSON structure!")

        elif args.analyze:
            # Analyze caption quality and validate file structure
            print("🔍 Analyzing caption quality and file structure...")
            content_map = improver.build_content_map_from_qmd(
                directories, specific_files=specific_files, **content_filters)
            if not content_map:
                print("❌ Failed to build content map for analysis")
                return 1

            # NOTE(review): specific_files is not forwarded to the two calls
            # below; confirm they behave as intended when only -f is given.
            improver.check_caption_quality(
                directories, content_map=content_map, **content_filters)
            improver.validate_qmd_mapping(directories, content_map)
            print("✅ Analysis completed!")

        elif args.repair:
            # Repair caption formatting issues only
            print("🔧 Repairing caption formatting issues...")
            content_map = improver.repair_captions(
                directories, specific_files=specific_files, **content_filters)
            if content_map and args.save_json:
                improver.save_content_map(content_map)
                print("💾 Repaired content map saved to content_map.json")
            print("✅ Caption repair completed!")

        else:
            # --improve and the no-flag default run the same workflow; only
            # the banner differs.
            if args.improve:
                print("🚀 Improving captions with LLM...")
            else:
                print("🚀 Improving captions with LLM (default mode)...")
            # NOTE(review): specific_files is not forwarded here, so with only
            # -f given the workflow receives an empty directory list; confirm.
            improved_content_map = improver.complete_caption_improvement_workflow(
                directories, args.save_json, **content_filters)
            if not improved_content_map:
                return 1

    except KeyboardInterrupt:
        print("\n⚠️ Operation cancelled by user")
        return 1
    except Exception as e:
        # Top-level boundary: report the error and signal failure to the shell.
        print(f"❌ Error: {e}")
        return 1

    return 0
if __name__ == "__main__":
    # main() returns 0 on success and 1 on failure; raise SystemExit with that
    # value so the shell sees the real exit status instead of always 0.
    raise SystemExit(main())