mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-03 08:08:51 -05:00
- Renamed vol2/advanced_intro to vol2/introduction for consistency - Updated all scripts and configs to use vol1/ instead of core/ - Updated pre-commit config to check all contents/ not just vol1/ - Updated path references in Lua filters, Python scripts, and configs
4156 lines
177 KiB
Python
Executable File
4156 lines
177 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
Figure Caption Improvement Script
|
||
|
||
A streamlined tool for improving figure and table captions in Quarto-based textbooks
|
||
using local Ollama LLM models with strong, educational language.
|
||
|
||
Main Modes:
|
||
1. --improve/-i: LLM caption improvement and file updates (default)
|
||
2. --build-map/-b: Build content map from QMD files and save to JSON
|
||
3. --analyze/-a: Quality analysis and file structure validation
|
||
4. --repair/-r: Fix formatting issues only (no LLM)
|
||
|
||
Features:
|
||
- Follows _quarto.yml chapter ordering
|
||
- 100% extraction success (270 figures, 92 tables)
|
||
- Strong language improvements (removes weak starters)
|
||
- Proper formatting (spacing, capitalization, table prefixes)
|
||
- Context-aware processing with paragraph-level analysis
|
||
"""
|
||
|
||
import argparse
import base64
import json
import os
import re
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import pypandoc
import requests
import yaml
from titlecase import titlecase
|
||
|
||
class CaptionQualityChecker:
    """Analyzes caption quality and identifies issues.

    Each rule is a callable taking the caption text and returning
    ``(is_good, issue_description)``; ``analyze_caption`` runs every rule
    and aggregates the failures into a per-caption report.
    """

    def __init__(self):
        # Rule name -> checker; iterated in insertion order by analyze_caption.
        self.quality_rules = {
            'missing_punctuation': self._check_punctuation,
            'poor_capitalization': self._check_capitalization,
            'too_generic': self._check_generic,
            'missing_bold_pattern': self._check_bold_pattern,
            'broken_formatting': self._check_formatting
        }

    def _check_punctuation(self, caption: str) -> Tuple[bool, str]:
        """Check if caption ends with proper punctuation."""
        # Empty captions pass here; _check_bold_pattern flags them instead.
        if not caption or caption.strip().endswith(('.', '!', '?')):
            return True, ""
        return False, "Missing period"

    def _check_capitalization(self, caption: str) -> Tuple[bool, str]:
        """Check if capitalization follows style guide."""
        if not caption:
            return True, ""

        # Check for obvious issues like all caps or all lowercase
        if caption.isupper():
            return False, "All caps"
        # A '**'-led caption intentionally lowercases nothing visible, so
        # only flag all-lowercase when it is not in bold-prefix form.
        if caption.islower() and not caption.startswith('**'):
            return False, "All lowercase"

        # More sophisticated checks could be added here
        return True, ""

    def _check_generic(self, caption: str) -> Tuple[bool, str]:
        """Check for overly generic captions."""
        if not caption:
            return True, ""

        # Anchored patterns describing boilerplate starts (or whole captions).
        generic_patterns = [
            r'^Figure shows',
            r'^Figure \d+',
            r'^Table shows',
            r'^Table \d+',
            r'^Diagram of',
            r'^Image of',
            r'^Screenshot',
            r'^Example$',
            r'^Overview$',
            r'^Comparison$',
        ]

        for pattern in generic_patterns:
            if re.search(pattern, caption, re.IGNORECASE):
                return False, "Too generic"

        return True, ""

    def _check_bold_pattern(self, caption: str) -> Tuple[bool, str]:
        """Check if caption follows **Concept**: explanation pattern."""
        if not caption:
            return False, "Empty caption"

        caption = caption.strip()

        # Check for **Bold**: explanation pattern
        bold_pattern = r'^\*\*[^*]+\*\*:'
        if not re.match(bold_pattern, caption):
            return False, "Missing **Bold**: explanation format"

        # Check minimum length even with format
        if len(caption) < 20:
            return False, "Too short even with proper format"

        return True, ""

    def _check_formatting(self, caption: str) -> Tuple[bool, str]:
        """Check for broken markdown formatting."""
        if not caption:
            return True, ""

        # Unmatched '**' markers indicate broken bold markup.
        bold_count = caption.count('**')
        if bold_count % 2 != 0:
            return False, "Unmatched ** markers"

        # Leftover template/LaTeX braces mean the caption was not rendered.
        if '{{' in caption or '}}' in caption:
            return False, "LaTeX artifacts"

        return True, ""

    def analyze_caption(self, caption: str) -> Dict[str, Any]:
        """Analyze a single caption and return quality report.

        Returns a dict with the original caption, the list of failed-rule
        descriptors, a needs_repair flag, and a (currently unused)
        suggestions list.

        Note: annotation fixed from ``Dict[str, any]`` (the builtin
        function) to ``Dict[str, Any]``.
        """
        issues = []
        suggestions = []

        for rule_name, rule_func in self.quality_rules.items():
            is_good, issue_desc = rule_func(caption)
            if not is_good:
                issues.append({
                    'type': rule_name,
                    'description': issue_desc
                })

        return {
            'caption': caption,
            'issues': issues,
            'needs_repair': len(issues) > 0,
            'suggestions': suggestions
        }
|
||
|
||
class FigureCaptionImprover:
|
||
"""
|
||
Main class for improving figure and table captions using local Ollama LLM models.
|
||
|
||
Provides streamlined modes:
|
||
- build_map: Extract content structure and save to JSON
|
||
- analyze: Quality analysis and file validation
|
||
- repair: Fix formatting issues only
|
||
- improve: Complete LLM caption improvement (default)
|
||
|
||
Features:
|
||
- Follows _quarto.yml chapter ordering
|
||
- 100% extraction success rate (270 figures, 92 tables)
|
||
- Strong language improvements with educational focus
|
||
- Proper formatting and spacing normalization
|
||
- Context-aware processing with retry logic
|
||
"""
|
||
|
||
def __init__(self, model_name="qwen2.5:7b"):
|
||
self.model_name = model_name
|
||
self.figure_pattern = re.compile(r'@fig-([a-zA-Z0-9_-]+)')
|
||
self.stats = {
|
||
'files_processed': 0,
|
||
'figures_found': 0,
|
||
'figures_improved': 0,
|
||
'tables_found': 0,
|
||
'tables_improved': 0,
|
||
'images_found': 0,
|
||
'images_missing': 0,
|
||
'json_success': 0,
|
||
'json_failed': 0,
|
||
'errors': []
|
||
}
|
||
self.content_map_file = "content_map.json"
|
||
self.quality_checker = CaptionQualityChecker()
|
||
self.quarto_config_file = "_quarto.yml"
|
||
|
||
def find_qmd_files(self, directory: str) -> List[Path]:
|
||
"""Find all .qmd files in a directory recursively."""
|
||
directory_path = Path(directory)
|
||
return list(directory_path.rglob("*.qmd"))
|
||
|
||
def get_book_chapters_from_quarto(self) -> Dict[str, List[str]]:
|
||
"""Parse _quarto.yml and return active and commented chapter files."""
|
||
if not os.path.exists(self.quarto_config_file):
|
||
print(f"❌ Quarto config not found: {self.quarto_config_file}")
|
||
return {'active': [], 'commented': []}
|
||
|
||
try:
|
||
with open(self.quarto_config_file, 'r', encoding='utf-8') as f:
|
||
config = yaml.safe_load(f)
|
||
|
||
chapters = config.get('book', {}).get('chapters', [])
|
||
active_chapters = []
|
||
|
||
for chapter in chapters:
|
||
# Handle different chapter formats
|
||
if isinstance(chapter, str):
|
||
# Simple string chapter (e.g., "index.qmd")
|
||
if chapter.endswith('.qmd'):
|
||
active_chapters.append(chapter)
|
||
elif isinstance(chapter, dict):
|
||
# Part or complex chapter structure
|
||
if 'part' in chapter and chapter['part'].endswith('.qmd'):
|
||
active_chapters.append(chapter['part'])
|
||
# Could add more complex handling here if needed
|
||
|
||
# Also read the raw file to find commented chapters
|
||
with open(self.quarto_config_file, 'r', encoding='utf-8') as f:
|
||
raw_content = f.read()
|
||
|
||
# Find commented chapter lines
|
||
commented_chapters = []
|
||
for line in raw_content.split('\n'):
|
||
line = line.strip()
|
||
if line.startswith('# - ') and line.endswith('.qmd'):
|
||
# Remove '# - ' prefix and clean up
|
||
commented_chapter = line[4:].strip()
|
||
commented_chapters.append(commented_chapter)
|
||
|
||
print(f"📚 Found {len(active_chapters)} active chapters, {len(commented_chapters)} commented chapters")
|
||
return {
|
||
'active': active_chapters,
|
||
'commented': commented_chapters
|
||
}
|
||
|
||
except Exception as e:
|
||
print(f"❌ Error parsing {self.quarto_config_file}: {e}")
|
||
return {'active': [], 'commented': []}
|
||
|
||
def find_qmd_files_in_order(self, directories: List[str]) -> List[Path]:
|
||
"""Find QMD files following the book's chapter order from _quarto.yml."""
|
||
book_structure = self.get_book_chapters_from_quarto()
|
||
active_chapters = book_structure.get('active', [])
|
||
|
||
if not active_chapters:
|
||
print("⚠️ No book structure found, falling back to directory scan")
|
||
# Fallback to original method
|
||
all_files = []
|
||
for directory in directories:
|
||
all_files.extend(self.find_qmd_files(directory))
|
||
return all_files
|
||
|
||
# Filter book chapters to only those in specified directories
|
||
filtered_chapters = []
|
||
directory_set = {os.path.normpath(d) for d in directories}
|
||
|
||
for chapter_path in active_chapters:
|
||
chapter_full_path = Path(chapter_path)
|
||
|
||
# Check if this chapter is within any of the specified directories
|
||
for directory in directory_set:
|
||
try:
|
||
# Try to see if chapter is within directory
|
||
chapter_full_path.relative_to(directory)
|
||
if chapter_full_path.exists():
|
||
filtered_chapters.append(chapter_full_path)
|
||
break
|
||
except ValueError:
|
||
# Not within this directory, continue
|
||
continue
|
||
|
||
print(f"📖 Processing {len(filtered_chapters)} chapters in book order")
|
||
return filtered_chapters
|
||
|
||
def check_commented_chapters_in_directories(self, directories: List[str]) -> Dict:
|
||
"""Check if any chapters are commented out within the target directories."""
|
||
book_structure = self.get_book_chapters_from_quarto()
|
||
commented_chapters = book_structure.get('commented', [])
|
||
|
||
# Normalize directory paths
|
||
directory_set = {os.path.normpath(d) for d in directories}
|
||
|
||
issues = {
|
||
'commented_in_target_dirs': [],
|
||
'total_issues': 0,
|
||
'should_halt': False
|
||
}
|
||
|
||
# Check if any commented chapters are within our target directories
|
||
for commented_chapter in commented_chapters:
|
||
chapter_path = Path(commented_chapter)
|
||
|
||
# Check if this commented chapter is within any target directory
|
||
for directory in directory_set:
|
||
try:
|
||
chapter_path.relative_to(directory)
|
||
# If we get here, the commented chapter is within this directory
|
||
issues['commented_in_target_dirs'].append({
|
||
'chapter': commented_chapter,
|
||
'directory': directory
|
||
})
|
||
break
|
||
except ValueError:
|
||
# Not within this directory, continue
|
||
continue
|
||
|
||
issues['total_issues'] = len(issues['commented_in_target_dirs'])
|
||
issues['should_halt'] = issues['total_issues'] > 0
|
||
|
||
return issues
|
||
|
||
def print_commented_chapter_issues(self, issues: Dict):
|
||
"""Print issues about commented chapters and halt if necessary."""
|
||
if issues['total_issues'] == 0:
|
||
return False
|
||
|
||
print(f"\n🚨 CRITICAL ISSUE:")
|
||
print(f"Found {issues['total_issues']} commented chapters in target directories")
|
||
print(f"Processing cannot continue as QMD files will be inconsistent.")
|
||
|
||
print(f"\n📁 Commented chapters in target directories:")
|
||
for item in issues['commented_in_target_dirs']:
|
||
print(f" • {item['chapter']} (in directory: {item['directory']})")
|
||
|
||
print(f"\n💡 To fix:")
|
||
print(f" 1. Uncomment these chapters in _quarto.yml, OR")
|
||
print(f" 2. Exclude these directories from processing, OR")
|
||
print(f" 3. Run 'quarto render --to=titlepage-pdf' after uncommenting")
|
||
print(f"\n❌ HALTING EXECUTION - Please resolve these issues first.")
|
||
|
||
return True # Should halt
|
||
|
||
def normalize_caption_punctuation(self, caption: str) -> str:
|
||
"""Ensure caption ends with a period for academic formatting."""
|
||
if not caption:
|
||
return caption
|
||
|
||
caption = caption.strip()
|
||
|
||
# Don't add period if already ends with punctuation
|
||
if caption and not caption.endswith(('.', '!', '?')):
|
||
caption += '.'
|
||
|
||
return caption
|
||
|
||
def normalize_caption_case(self, caption: str) -> str:
|
||
"""Normalize caption case using proper sentence case for technical content."""
|
||
if not caption:
|
||
return caption
|
||
|
||
# Check if this follows **Bold**: explanation format
|
||
bold_pattern = r'^(\*\*[^*]+\*\*:\s*)(.+)$'
|
||
match = re.match(bold_pattern, caption.strip())
|
||
|
||
if match:
|
||
# Only apply sentence case to the explanation part after the colon
|
||
bold_part = match.group(1) # **Bold**:
|
||
explanation_part = match.group(2) # explanation text
|
||
fixed_explanation = self.apply_sentence_case(explanation_part)
|
||
return bold_part + fixed_explanation
|
||
else:
|
||
# Apply sentence case to entire caption if not in **Bold**: format
|
||
return self.apply_sentence_case(caption)
|
||
|
||
def apply_sentence_case(self, text: str) -> str:
|
||
"""
|
||
Apply proper sentence case using comprehensive rules for technical content.
|
||
|
||
Rules:
|
||
- Capitalize first word
|
||
- Preserve proper nouns, acronyms, and technical terms
|
||
- Handle contractions and possessives correctly
|
||
- Don't lowercase words that should stay capitalized
|
||
"""
|
||
if not text:
|
||
return text
|
||
|
||
# Comprehensive list of terms to preserve (case-sensitive)
|
||
preserve_exact = {
|
||
# Technical acronyms
|
||
'AI', 'ML', 'IoT', 'GPU', 'CPU', 'API', 'UI', 'UX', 'PDF', 'HTML', 'JSON', 'XML',
|
||
'HTTP', 'HTTPS', 'SQL', 'NoSQL', 'REST', 'SOAP', 'TCP', 'UDP', 'IP', 'DNS',
|
||
'TinyML', 'MLOps', 'DevOps', 'CI/CD', 'SDK', 'IDE', 'CLI', 'GUI',
|
||
|
||
# Companies and products
|
||
'AlexNet', 'FarmBeats', 'TikZ', 'LaTeX', 'GitHub', 'YouTube', 'Microsoft',
|
||
'Google', 'Amazon', 'Facebook', 'Netflix', 'Tesla', 'OpenAI', 'NVIDIA',
|
||
'PyTorch', 'TensorFlow', 'Keras', 'Scikit-learn',
|
||
|
||
# Research terms
|
||
'CNN', 'RNN', 'LSTM', 'GRU', 'BERT', 'GPT', 'ResNet', 'VGG', 'YOLO',
|
||
'IoU', 'mAP', 'BLEU', 'ROUGE', 'F1', 'ROC', 'AUC', 'MSE', 'RMSE',
|
||
'SGD', 'Adam', 'AdaGrad', 'RMSprop',
|
||
|
||
# File formats and standards
|
||
'PNG', 'JPEG', 'SVG', 'CSV', 'YAML', 'TOML', 'HDF5', 'ONNX',
|
||
'Docker', 'Kubernetes', 'AWS', 'GCP', 'Azure', 'S3',
|
||
|
||
# Programming languages and tools
|
||
'Python', 'JavaScript', 'TypeScript', 'Java', 'C++', 'C#', 'Go', 'Rust',
|
||
'React', 'Vue', 'Angular', 'Node.js', 'MongoDB', 'PostgreSQL', 'Redis'
|
||
}
|
||
|
||
# Split into words while preserving spaces and punctuation
|
||
tokens = re.findall(r'\b\w+(?:\'\w+)?\b|\s+|[^\w\s]', text)
|
||
result_tokens = []
|
||
word_index = 0 # Track actual words (not spaces/punctuation)
|
||
|
||
for token in tokens:
|
||
if re.match(r'\s+', token): # Preserve whitespace
|
||
result_tokens.append(token)
|
||
elif not re.match(r'\w', token): # Preserve punctuation
|
||
result_tokens.append(token)
|
||
else: # Process words
|
||
# First word is always capitalized
|
||
if word_index == 0:
|
||
# Check if it's a preserved term first
|
||
if token.upper() in [p.upper() for p in preserve_exact]:
|
||
# Find the exact preserved form
|
||
preserved_form = next(p for p in preserve_exact if p.upper() == token.upper())
|
||
result_tokens.append(preserved_form)
|
||
else:
|
||
result_tokens.append(token.capitalize())
|
||
else:
|
||
# Check if word should be preserved as-is
|
||
if token.upper() in [p.upper() for p in preserve_exact]:
|
||
# Find the exact preserved form
|
||
preserved_form = next(p for p in preserve_exact if p.upper() == token.upper())
|
||
result_tokens.append(preserved_form)
|
||
elif token.isupper() and len(token) > 1:
|
||
# Preserve all-caps words (likely acronyms)
|
||
result_tokens.append(token)
|
||
else:
|
||
# Apply lowercase for regular words
|
||
result_tokens.append(token.lower())
|
||
|
||
word_index += 1
|
||
|
||
return ''.join(result_tokens)
|
||
|
||
def format_bold_explanation_caption(self, caption: str) -> str:
|
||
"""
|
||
Format caption to ensure proper **bold**: explanation capitalization.
|
||
Bold part: Title Case, Explanation part: Proper sentence case
|
||
"""
|
||
if not caption or '**' not in caption or ':' not in caption:
|
||
return caption
|
||
|
||
# Parse **bold**: explanation format
|
||
match = re.match(r'^\*\*([^*]+)\*\*:\s*(.+)$', caption.strip())
|
||
if not match:
|
||
return caption
|
||
|
||
bold_part = match.group(1).strip()
|
||
explanation_part = match.group(2).strip()
|
||
|
||
# Apply title case to bold part
|
||
bold_part = titlecase(bold_part)
|
||
|
||
# Apply proper sentence case to explanation part
|
||
explanation_part = self.apply_sentence_case(explanation_part)
|
||
|
||
return f"**{bold_part}**: {explanation_part}"
|
||
|
||
def fix_capitalization_after_periods(self, text: str) -> str:
|
||
"""
|
||
Fix capitalization after periods, handling edge cases properly.
|
||
"""
|
||
if not text:
|
||
return text
|
||
|
||
# Common abbreviations that shouldn't trigger capitalization of next word
|
||
abbreviations = {
|
||
'dr.', 'prof.', 'mr.', 'mrs.', 'ms.', 'vs.', 'etc.', 'i.e.', 'e.g.',
|
||
'fig.', 'tbl.', 'eq.', 'sec.', 'ch.', 'vol.', 'no.', 'p.', 'pp.',
|
||
'ml.', 'ai.', 'gpu.', 'cpu.', 'api.', 'url.', 'http.', 'https.'
|
||
}
|
||
|
||
# Split into sentences while preserving the structure
|
||
sentences = re.split(r'(\. )', text)
|
||
result_parts = []
|
||
|
||
for i, part in enumerate(sentences):
|
||
if i == 0:
|
||
# First part - capitalize first letter
|
||
if part and part[0].islower():
|
||
part = part[0].upper() + part[1:]
|
||
elif part == '. ' and i + 1 < len(sentences):
|
||
# Period followed by space - check next part
|
||
next_part = sentences[i + 1] if i + 1 < len(sentences) else ""
|
||
if next_part:
|
||
# Check if previous word was an abbreviation
|
||
prev_parts = ''.join(sentences[:i])
|
||
words = prev_parts.split()
|
||
last_word = words[-1].lower() if words else ""
|
||
|
||
# If not an abbreviation, capitalize next sentence
|
||
if last_word not in abbreviations:
|
||
if next_part and next_part[0].islower():
|
||
sentences[i + 1] = next_part[0].upper() + next_part[1:]
|
||
|
||
result_parts.append(part)
|
||
|
||
return ''.join(result_parts)
|
||
|
||
def improve_sentence_starters(self, text: str) -> str:
|
||
"""
|
||
Replace weak sentence starters and mid-sentence weak patterns with stronger, more direct language.
|
||
"""
|
||
if not text:
|
||
return text
|
||
|
||
# Split into sentences and improve each one
|
||
sentences = re.split(r'(\. )', text)
|
||
improved_sentences = []
|
||
|
||
for sentence in sentences:
|
||
if sentence == '. ':
|
||
improved_sentences.append(sentence)
|
||
continue
|
||
|
||
original = sentence.strip()
|
||
if not original:
|
||
improved_sentences.append(sentence)
|
||
continue
|
||
|
||
# Apply improvements to this sentence
|
||
improved = original
|
||
|
||
# Patterns for beginning-of-sentence weak starters
|
||
beginning_patterns = [
|
||
# "Illustrates how X" -> "X" (direct approach)
|
||
(r'^illustrates how (.+)$', r'\1'),
|
||
(r'^shows how (.+)$', r'\1'),
|
||
(r'^demonstrates how (.+)$', r'\1'),
|
||
(r'^depicts how (.+)$', r'\1'),
|
||
(r'^reveals how (.+)$', r'\1'),
|
||
(r'^highlights how (.+)$', r'\1'),
|
||
(r'^visualizes how (.+)$', r'\1'),
|
||
(r'^exemplifies how (.+)$', r'\1'),
|
||
(r'^traces how (.+)$', r'\1'),
|
||
(r'^explains how (.+)$', r'\1'),
|
||
(r'^displays how (.+)$', r'\1'),
|
||
(r'^presents how (.+)$', r'\1'),
|
||
|
||
# "Illustrates the X" -> "The X" (remove weak verb)
|
||
(r'^illustrates the (.+)$', r'The \1'),
|
||
(r'^shows the (.+)$', r'The \1'),
|
||
(r'^demonstrates the (.+)$', r'The \1'),
|
||
(r'^depicts the (.+)$', r'The \1'),
|
||
(r'^reveals the (.+)$', r'The \1'),
|
||
(r'^highlights the (.+)$', r'The \1'),
|
||
(r'^visualizes the (.+)$', r'The \1'),
|
||
(r'^exemplifies the (.+)$', r'The \1'),
|
||
(r'^traces the (.+)$', r'The \1'),
|
||
(r'^explains the (.+)$', r'The \1'),
|
||
(r'^displays the (.+)$', r'The \1'),
|
||
(r'^presents the (.+)$', r'The \1'),
|
||
|
||
# Generic weak starters at beginning - remove entirely
|
||
(r'^illustrates (.+)$', r'\1'),
|
||
(r'^shows (.+)$', r'\1'),
|
||
(r'^demonstrates (.+)$', r'\1'),
|
||
(r'^depicts (.+)$', r'\1'),
|
||
(r'^reveals (.+)$', r'\1'),
|
||
(r'^highlights (.+)$', r'\1'),
|
||
(r'^visualizes (.+)$', r'\1'),
|
||
(r'^exemplifies (.+)$', r'\1'),
|
||
(r'^traces (.+)$', r'\1'),
|
||
(r'^explains (.+)$', r'\1'),
|
||
(r'^displays (.+)$', r'\1'),
|
||
(r'^presents (.+)$', r'\1'),
|
||
]
|
||
|
||
# Apply beginning-of-sentence patterns first
|
||
for pattern, replacement in beginning_patterns:
|
||
if re.search(pattern, improved, re.IGNORECASE):
|
||
improved = re.sub(pattern, replacement, improved, flags=re.IGNORECASE)
|
||
break
|
||
|
||
# Patterns for mid-sentence weak constructions
|
||
mid_sentence_patterns = [
|
||
# "X illustrates how Y" -> stronger constructions
|
||
(r'(.+?)\s+illustrates how (.+)', r'\2 through \1'),
|
||
(r'(.+?)\s+demonstrates how (.+)', r'\2 via \1'),
|
||
(r'(.+?)\s+depicts how (.+)', r'\2 using \1'),
|
||
(r'(.+?)\s+reveals how (.+)', r'\2 through \1'),
|
||
(r'(.+?)\s+highlights how (.+)', r'\2 via \1'),
|
||
(r'(.+?)\s+visualizes how (.+)', r'\2 through \1'),
|
||
(r'(.+?)\s+exemplifies how (.+)', r'\2 via \1'),
|
||
(r'(.+?)\s+traces how (.+)', r'\2 through \1'),
|
||
(r'(.+?)\s+explains how (.+)', r'\2 via \1'),
|
||
(r'(.+?)\s+displays how (.+)', r'\2 using \1'),
|
||
(r'(.+?)\s+presents how (.+)', r'\2 through \1'),
|
||
|
||
# "X illustrates that Y" -> "X confirms that Y" / "X establishes that Y"
|
||
(r'(.+?)\s+illustrates that (.+)', r'\1 confirms that \2'),
|
||
(r'(.+?)\s+demonstrates that (.+)', r'\1 establishes that \2'),
|
||
(r'(.+?)\s+depicts that (.+)', r'\1 confirms that \2'),
|
||
(r'(.+?)\s+reveals that (.+)', r'\1 establishes that \2'),
|
||
(r'(.+?)\s+highlights that (.+)', r'\1 emphasizes that \2'),
|
||
(r'(.+?)\s+visualizes that (.+)', r'\1 confirms that \2'),
|
||
(r'(.+?)\s+exemplifies that (.+)', r'\1 establishes that \2'),
|
||
(r'(.+?)\s+traces that (.+)', r'\1 confirms that \2'),
|
||
(r'(.+?)\s+explains that (.+)', r'\1 establishes that \2'),
|
||
(r'(.+?)\s+displays that (.+)', r'\1 confirms that \2'),
|
||
(r'(.+?)\s+presents that (.+)', r'\1 establishes that \2'),
|
||
|
||
# "X illustrates Y" -> "X enables Y" / "X provides Y"
|
||
(r'(.+?)\s+illustrates (.+)', r'\1 enables \2'),
|
||
(r'(.+?)\s+demonstrates (.+)', r'\1 provides \2'),
|
||
(r'(.+?)\s+depicts (.+)', r'\1 presents \2'),
|
||
(r'(.+?)\s+reveals (.+)', r'\1 exposes \2'),
|
||
(r'(.+?)\s+highlights (.+)', r'\1 emphasizes \2'),
|
||
(r'(.+?)\s+visualizes (.+)', r'\1 presents \2'),
|
||
(r'(.+?)\s+exemplifies (.+)', r'\1 provides \2'),
|
||
(r'(.+?)\s+traces (.+)', r'\1 reveals \2'),
|
||
(r'(.+?)\s+explains (.+)', r'\1 clarifies \2'),
|
||
(r'(.+?)\s+displays (.+)', r'\1 presents \2'),
|
||
(r'(.+?)\s+presents (.+)', r'\1 provides \2'),
|
||
]
|
||
|
||
# Apply mid-sentence patterns
|
||
for pattern, replacement in mid_sentence_patterns:
|
||
if re.search(pattern, improved, re.IGNORECASE):
|
||
improved = re.sub(pattern, replacement, improved, flags=re.IGNORECASE)
|
||
break
|
||
|
||
# Ensure first letter is capitalized
|
||
if improved and improved[0].islower():
|
||
improved = improved[0].upper() + improved[1:]
|
||
|
||
# Add the improved sentence (spacing will be normalized later)
|
||
improved_sentences.append(improved)
|
||
|
||
return ''.join(improved_sentences)
|
||
|
||
def normalize_spacing(self, text: str) -> str:
|
||
"""
|
||
Normalize spacing in text - remove multiple spaces, leading/trailing spaces.
|
||
"""
|
||
if not text:
|
||
return text
|
||
|
||
# Replace multiple spaces with single space
|
||
text = re.sub(r'\s+', ' ', text)
|
||
|
||
# Remove leading/trailing spaces
|
||
text = text.strip()
|
||
|
||
return text
|
||
|
||
def escape_caption_for_regex(self, caption: str) -> str:
|
||
"""
|
||
Escape caption text for use in regex patterns.
|
||
|
||
Unlike re.escape(), this only escapes true regex metacharacters
|
||
that would break pattern matching, while preserving normal text
|
||
characters like parentheses () which are common in captions.
|
||
|
||
Args:
|
||
caption: Caption text that may contain special characters
|
||
|
||
Returns:
|
||
Caption with only problematic regex chars escaped
|
||
"""
|
||
if not caption:
|
||
return caption
|
||
|
||
# Only escape characters that are actual regex metacharacters
|
||
# and would break pattern matching. Common caption chars like () are preserved.
|
||
# Regex metacharacters to escape: . ^ $ * + ? { } [ ] \ |
|
||
metacharacters = r'\.^$*+?{}[]\\|'
|
||
escaped = ''
|
||
for char in caption:
|
||
if char in metacharacters:
|
||
escaped += '\\' + char
|
||
else:
|
||
escaped += char
|
||
return escaped
|
||
|
||
def ensure_yaml_safe_caption(self, caption: str) -> str:
|
||
"""
|
||
Ensure caption is safe for YAML parsing by adding quotes when needed.
|
||
|
||
YAML interprets text starting with ** as alias references, which causes parsing errors.
|
||
This function adds quotes around captions that start with ** to prevent YAML issues.
|
||
|
||
Args:
|
||
caption: The caption text to make YAML-safe
|
||
|
||
Returns:
|
||
Caption with proper YAML quoting if needed
|
||
"""
|
||
if not caption:
|
||
return caption
|
||
|
||
caption = caption.strip()
|
||
|
||
# Check if caption starts with ** (which causes YAML parsing issues)
|
||
if caption.startswith('**'):
|
||
# Check if it's already properly quoted
|
||
if (caption.startswith('"') and caption.endswith('"')) or \
|
||
(caption.startswith("'") and caption.endswith("'")):
|
||
# Already quoted, return as-is
|
||
return caption
|
||
else:
|
||
# Add double quotes to make it YAML-safe
|
||
# Escape any existing double quotes within the caption
|
||
escaped_caption = caption.replace('"', '\\"')
|
||
return f'"{escaped_caption}"'
|
||
|
||
return caption
|
||
|
||
def extract_caption_from_yaml_value(self, yaml_value: str) -> str:
|
||
"""
|
||
Extract the actual caption text from a YAML value, handling quoted and unquoted cases.
|
||
|
||
Args:
|
||
yaml_value: The YAML value which might be quoted or unquoted
|
||
|
||
Returns:
|
||
The clean caption text without YAML quoting
|
||
"""
|
||
if not yaml_value:
|
||
return yaml_value
|
||
|
||
yaml_value = yaml_value.strip()
|
||
|
||
# Handle double quotes
|
||
if yaml_value.startswith('"') and yaml_value.endswith('"'):
|
||
# Remove quotes and unescape any escaped quotes
|
||
clean_caption = yaml_value[1:-1].replace('\\"', '"')
|
||
return clean_caption
|
||
|
||
# Handle single quotes
|
||
if yaml_value.startswith("'") and yaml_value.endswith("'"):
|
||
# Remove quotes and handle single quote escaping
|
||
clean_caption = yaml_value[1:-1].replace("''", "'")
|
||
return clean_caption
|
||
|
||
# Not quoted, return as-is
|
||
return yaml_value
|
||
|
||
def validate_and_improve_caption(self, caption: str, is_table: bool = False) -> str:
|
||
"""
|
||
Apply all quality improvements to a caption.
|
||
|
||
Args:
|
||
caption: The caption to improve
|
||
is_table: Whether this is a table caption (affects format requirements)
|
||
|
||
Returns:
|
||
Improved caption with proper capitalization, strong language, and format
|
||
"""
|
||
if not caption:
|
||
return caption
|
||
|
||
# Normalize input spacing first
|
||
caption = self.normalize_spacing(caption)
|
||
|
||
# Check if caption already has table prefix and remove it for processing
|
||
has_table_prefix = caption.startswith(': ')
|
||
if has_table_prefix:
|
||
caption = caption[2:] # Remove ': ' prefix
|
||
|
||
# Clean up any extra leading colons and spaces (handles edge cases like ::, : :, etc.)
|
||
caption = re.sub(r'^:+\s*', '', caption)
|
||
|
||
# Parse **bold**: explanation format (handle spaces around colon)
|
||
match = re.match(r'^(\*\*[^*]+\*\*)\s*:\s*(.+)$', caption)
|
||
if not match:
|
||
return caption
|
||
|
||
bold_part = match.group(1)
|
||
explanation = match.group(2)
|
||
|
||
# Improve sentence starters in explanation
|
||
explanation = self.improve_sentence_starters(explanation)
|
||
|
||
# Fix capitalization after periods
|
||
explanation = self.fix_capitalization_after_periods(explanation)
|
||
|
||
# Normalize spacing in explanation
|
||
explanation = self.normalize_spacing(explanation)
|
||
|
||
# Combine with proper single space after colon
|
||
improved = f"{bold_part}: {explanation}"
|
||
|
||
return improved
|
||
|
||
def encode_image(self, image_path: str) -> Optional[str]:
|
||
"""Encode an image to base64 for multimodal models."""
|
||
try:
|
||
with open(image_path, "rb") as image_file:
|
||
return base64.b64encode(image_file.read()).decode('utf-8')
|
||
except Exception as e:
|
||
print(f"❌ Error encoding image {image_path}: {e}")
|
||
return None
|
||
|
||
def extract_paragraph_context(self, content: str, figure_or_table_id: str) -> Dict[str, str]:
|
||
"""Extract focused context: paragraph with figure + adjacent paragraphs."""
|
||
|
||
# Split content into paragraphs (double newlines or section breaks)
|
||
# Keep section headers with their following content
|
||
paragraphs = re.split(r'\n\s*\n', content)
|
||
paragraphs = [p.strip() for p in paragraphs if p.strip()]
|
||
|
||
# Find which paragraph contains our figure/table
|
||
target_paragraph_idx = None
|
||
section_title = "Unknown Section"
|
||
|
||
for i, paragraph in enumerate(paragraphs):
|
||
# Check if this paragraph contains the figure definition or reference
|
||
if (figure_or_table_id in paragraph or
|
||
f"@{figure_or_table_id}" in paragraph):
|
||
target_paragraph_idx = i
|
||
break
|
||
|
||
if target_paragraph_idx is None:
|
||
# Fallback: search in full content and extract around location
|
||
return self._extract_fallback_context(content, figure_or_table_id)
|
||
|
||
# Extract section title from content before the target paragraph
|
||
full_content_before = '\n\n'.join(paragraphs[:target_paragraph_idx + 1])
|
||
section_headers = re.findall(r'^##\s+([^#\n]+?)(?:\s*\{#[^}]+\}.*)?$',
|
||
full_content_before, re.MULTILINE)
|
||
if section_headers:
|
||
section_title = section_headers[-1].strip() # Get the most recent header
|
||
|
||
# Collect context paragraphs: [previous, current, next]
|
||
context_paragraphs = []
|
||
|
||
# Previous paragraph (if exists)
|
||
if target_paragraph_idx > 0:
|
||
prev_para = paragraphs[target_paragraph_idx - 1]
|
||
# Skip if it's just a section header
|
||
if not re.match(r'^##\s+', prev_para):
|
||
context_paragraphs.append(prev_para)
|
||
|
||
# Current paragraph (with the figure)
|
||
context_paragraphs.append(paragraphs[target_paragraph_idx])
|
||
|
||
# Next paragraph (if exists)
|
||
if target_paragraph_idx + 1 < len(paragraphs):
|
||
next_para = paragraphs[target_paragraph_idx + 1]
|
||
# Skip if it's a section header
|
||
if not re.match(r'^##\s+', next_para):
|
||
context_paragraphs.append(next_para)
|
||
|
||
context_content = '\n\n'.join(context_paragraphs)
|
||
|
||
return {
|
||
'title': section_title,
|
||
'content': context_content
|
||
}
|
||
|
||
def _extract_fallback_context(self, content: str, figure_or_table_id: str) -> Dict[str, str]:
|
||
"""Fallback: extract ±300 words around figure reference."""
|
||
lines = content.split('\n')
|
||
section_title = "Unknown Section"
|
||
|
||
for i, line in enumerate(lines):
|
||
if figure_or_table_id in line or f"@{figure_or_table_id}" in line:
|
||
# Find section heading
|
||
for j in range(i, -1, -1):
|
||
check_line = lines[j].strip()
|
||
if check_line.startswith('##') and not check_line.startswith('###'):
|
||
section_title = re.sub(r'^#+\s*', '', check_line)
|
||
section_title = re.sub(r'\s*\{#[^}]+\}.*$', '', section_title)
|
||
break
|
||
|
||
# Extract context around reference (±10 lines, then expand to word boundaries)
|
||
start_idx = max(0, i - 10)
|
||
end_idx = min(len(lines), i + 10)
|
||
context_text = '\n'.join(lines[start_idx:end_idx]).strip()
|
||
|
||
# Limit to ~300 words around the figure
|
||
words = context_text.split()
|
||
if len(words) > 300:
|
||
# Find the figure reference position in words
|
||
fig_word_pos = None
|
||
for word_idx, word in enumerate(words):
|
||
if figure_or_table_id in word:
|
||
fig_word_pos = word_idx
|
||
break
|
||
|
||
if fig_word_pos:
|
||
start_word = max(0, fig_word_pos - 150)
|
||
end_word = min(len(words), fig_word_pos + 150)
|
||
context_text = ' '.join(words[start_word:end_word])
|
||
|
||
return {
|
||
'title': section_title,
|
||
'content': context_text
|
||
}
|
||
|
||
# Ultimate fallback
|
||
return {
|
||
'title': "Unknown Section",
|
||
'content': content[:1000]
|
||
}
|
||
|
||
|
||
def generate_caption_with_ollama(self, section_title: str, section_text: str,
                                 figure_id: str, current_caption: str,
                                 image_path: Optional[str] = None, is_table: bool = False,
                                 is_listing: bool = False, code_content: Optional[str] = None) -> Optional[str]:
    """Generate improved caption using Ollama multimodal model with retry logic.

    Builds a long instruction prompt (format rules, banned weak openers,
    worked before/after examples), optionally attaches a base64-encoded
    image for multimodal models, and POSTs it to the local Ollama
    /api/generate endpoint. API/network errors are retried with
    exponential backoff; a generation that lacks the required
    ``**bold**:`` format is rejected without retry.

    Args:
        section_title: Title of the section containing the visual.
        section_text: Surrounding textbook context (first 1500 chars used).
        figure_id: Figure/table ID. Accepted for interface symmetry; not
            referenced in the prompt or request body.
        current_caption: Existing caption to be improved.
        image_path: Optional image file passed to multimodal models.
        is_table: Forwarded to validate_and_improve_caption.
        is_listing: True for code listings (changes prompt wording and
            includes code_content in the prompt).
        code_content: Listing source text (first 1500 chars used).

    Returns:
        The formatted/validated improved caption, or None on any failure.
    """
    import time

    # Construct a focused, context-aware prompt
    content_type = "code listing" if is_listing else "visual (figure or table)"

    prompt = f"""You are an expert at editing a caption for a {content_type} in a technical AI/ML systems textbook.

Your task is to improve the caption so that it *teaches*. The goal is to help students understand what the {"code demonstrates or implements" if is_listing else "visual conveys"} in the context of machine learning systems.

SECTION: {section_title}
ORIGINAL CAPTION: {current_caption}

TEXTBOOK CONTEXT (for reference):
{section_text[:1500]}

{f'''CODE CONTENT:
{code_content[:1500] if code_content else "No code content provided"}

''' if is_listing else ""}

🧠 TASK: Rewrite the caption to make it educational, precise, and aligned with the visual's teaching purpose.

✍️ FORMAT:
**<Key Phrase>**: Explanation sentence(s)

🚫 CRITICAL RULE - NEVER START WITH WEAK DESCRIPTIVE VERBS:
You must NEVER begin explanation sentences with these weak words:
"Shows", "Demonstrates", "Illustrates", "Depicts", "Reveals", "Highlights", "Displays", "Presents", "Exhibits", "Portrays", "Visualizes", "Exemplifies", "Traces", "Explains"

Instead, write DIRECT, ACTIVE statements:
❌ BAD: "Shows how neural networks process data"
✅ GOOD: "Neural networks process data through layered transformations"
❌ BAD: "Demonstrates the relationship between accuracy and efficiency"
✅ GOOD: "Higher model accuracy requires more computational resources"

✅ REQUIREMENTS:

1. **Key Phrase**: A single bolded noun phrase (1–5 words) that captures the main idea. Avoid full sentences or multiple bolded phrases. If similar figures exist in this section, choose a unique but relevant phrase.

2. **Explanation**: 1–2 concise, natural sentences that express what the student *learns* from the figure or table. Use active voice. Avoid simply describing what the figure "shows"—explain what *insight* it provides or how it advances understanding.

3. **Terminology**: Use domain-specific language from the original caption if helpful, but rephrase it to clarify meaning for students.

4. **No Weak Openers**: Do not begin with "This figure...", "This table...", or "This diagram...". Begin with the concept or the takeaway.

5. **Clarity & Precision**: Be specific, pedagogical, and concrete. Emphasize learning outcomes over general description.

6. **Tone**: Use a textbook tone. Use technical but student-friendly language appropriate for upper-level undergraduates or early graduate students. Avoid jargon unless it is defined or central to the concept.

7. **Sources**: If the original caption includes a source (e.g., "Source: IEEE Spectrum"), retain it at the end of the caption in italics. Append it after a period.

📌 EXCELLENT TEXTBOOK EXAMPLES:
**Attention Mechanism**: Transformer models compute attention through query-key-value interactions, enabling dynamic focus across input sequences for improved language understanding.
**Farm Edge Integration**: Modern agricultural systems deploy AI directly on IoT devices to process sensor data locally, reducing latency and improving real-time decision making.
**Training Pipeline**: Machine learning workflows partition datasets into training, validation, and test sets to ensure robust model development and unbiased evaluation.

🚫 AVOID:
- Starting with "This figure shows..." or "This table illustrates..."
- Using a full sentence or list as the bold key phrase
- Repeating the section title or being too vague (e.g., **AI System**)

🖊️ OUTPUT: Write only the improved caption below:

🚫 CRITICAL: NEVER START WITH WEAK VERBS:
- BANNED WORDS: "Shows", "Demonstrates", "Illustrates", "Depicts", "Reveals", "Highlights", "Displays", "Presents", "Exhibits", "Portrays", "Visualizes", "Exemplifies", "Traces", "Explains"
- BANNED PHRASES: "This figure/table/diagram...", "As shown in...", "The illustration demonstrates...", "The visual depicts..."
- These make captions sound like descriptions, not teaching tools

✅ STRONG SENTENCE PATTERNS TO USE:
- Direct statements: "Neural networks process data through..."
- System descriptions: "The architecture combines..."
- Process explanations: "Training requires..."
- Comparative insights: "Edge computing reduces latency while..."
- Technical definitions: "Convolutional layers extract..."
- Causal relationships: "Larger models achieve higher accuracy but..."

💡 BEFORE vs AFTER EXAMPLES:
❌ WEAK: "Illustrates how neural networks process data"
✅ STRONG: "Neural networks process input data through hierarchical feature extraction"

❌ WEAK: "Shows the relationship between accuracy and efficiency"
✅ STRONG: "Model accuracy increases with computational complexity, creating efficiency trade-offs"

❌ WEAK: "Demonstrates edge computing benefits"
✅ STRONG: "Edge computing reduces latency by processing data locally rather than in the cloud"

❌ WEAK: "Visualizes the ML pipeline stages"
✅ STRONG: "Machine learning pipelines consist of data preprocessing, training, and deployment phases"

🎯 FINAL REMINDER: Write the explanation sentence(s) with DIRECT, ACTIVE language. NO weak descriptive verbs!
"""

    # Retry logic: up to max_retries attempts (currently 2) with
    # exponential backoff. Only API/network errors are retried; badly
    # formatted generations return immediately.
    max_retries = 2
    base_delay = 1  # seconds

    for attempt in range(max_retries):
        try:
            # Prepare the request payload
            payload = {
                "model": self.model_name,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.7,  # Higher temperature for more diverse, creative captions
                    "num_predict": -1,  # No token limit - generate complete responses
                    "top_p": 0.9  # Add nucleus sampling for better variety
                }
            }

            # Add image if provided (for multimodal models)
            if image_path and os.path.exists(image_path):
                encoded_image = self.encode_image(image_path)
                if encoded_image:
                    payload["images"] = [encoded_image]

            # Make request to Ollama
            response = requests.post(
                "http://localhost:11434/api/generate",
                json=payload,
                timeout=60
            )

            if response.status_code == 200:
                result = response.json()
                new_caption = result.get('response', '').strip()

                # Clean up any markdown code blocks
                if new_caption.startswith('```') and new_caption.endswith('```'):
                    new_caption = new_caption.strip('`').strip()
                    if new_caption.startswith('json\n'):
                        new_caption = new_caption[5:].strip()

                # Log word count but never reject based on length
                word_count = len(new_caption.split())
                if word_count > 150:
                    print(f" ℹ️ Long caption generated ({word_count} words): {new_caption[:100]}...")
                    # Continue processing - user wants NO truncation limits

                # Validate the format contains **bold**:
                if '**' in new_caption and ':' in new_caption:
                    # Apply comprehensive quality improvements
                    formatted_caption = self.format_bold_explanation_caption(new_caption)
                    improved_caption = self.validate_and_improve_caption(formatted_caption, is_table)

                    # Log final word count but never reject
                    final_word_count = len(improved_caption.split())
                    if final_word_count > 150:
                        print(f" ℹ️ Large improved caption ({final_word_count} words): continuing anyway...")

                    return improved_caption
                else:
                    print(f" ⚠️ Generated caption doesn't follow **bold**: format: {new_caption[:100]}")
                    # Don't retry for format issues - this is a generation problem, not API error
                    return None
            else:
                # API error - this is worth retrying
                if attempt < max_retries - 1:
                    delay = base_delay * (2 ** attempt)
                    print(f" ⚠️ Ollama API error {response.status_code}, retrying in {delay}s (attempt {attempt + 1}/{max_retries})")
                    time.sleep(delay)
                    continue
                else:
                    print(f" ❌ Ollama API error: {response.status_code} (all {max_retries} attempts failed)")
                    return None

        except requests.exceptions.RequestException as e:
            # Network/connection error - worth retrying
            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)
                print(f" ⚠️ Request error: {e}, retrying in {delay}s (attempt {attempt + 1}/{max_retries})")
                time.sleep(delay)
                continue
            else:
                print(f" ❌ Request error: {e} (all {max_retries} attempts failed)")
                return None
        except Exception as e:
            # Unexpected error - worth retrying once but likely a code issue
            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt)
                print(f" ⚠️ Unexpected error: {e}, retrying in {delay}s (attempt {attempt + 1}/{max_retries})")
                time.sleep(delay)
                continue
            else:
                print(f" ❌ Unexpected error: {e} (all {max_retries} attempts failed)")
                return None

    # Should never reach here due to the loop structure, but just in case
    return None
|
||
|
||
def compile_tikz_to_image(self, tikz_code: str, figure_id: str) -> Optional[str]:
    """Compile TikZ code to a PNG image for multimodal processing.

    Pipeline: wrap the TikZ snippet in a standalone LaTeX document,
    compile it with pdflatex, then rasterize the PDF to PNG with
    ImageMagick. Intermediate .tex/.pdf/.aux/.log files are always
    removed in the finally block; the PNG is left for the caller.

    Args:
        tikz_code: Raw TikZ source (a tikzpicture environment).
        figure_id: Used to name the temporary files in temp_tikz/.

    Returns:
        Path to the rendered PNG as a string, or None on any failure
        (LaTeX error, conversion error, timeout, missing tools).
    """
    temp_dir = Path("temp_tikz")
    temp_dir.mkdir(exist_ok=True)

    tex_file = temp_dir / f"{figure_id}.tex"
    pdf_file = temp_dir / f"{figure_id}.pdf"
    png_file = temp_dir / f"{figure_id}.png"

    # Minimal standalone document so the output is tightly cropped.
    latex_content = f"""\\documentclass{{standalone}}
\\usepackage{{tikz}}
\\usepackage{{pgfplots}}
\\usetikzlibrary{{positioning,arrows,shapes,calc}}
\\begin{{document}}
{tikz_code}
\\end{{document}}"""

    try:
        # Write LaTeX file
        with open(tex_file, 'w', encoding='utf-8') as f:
            f.write(latex_content)

        # Compile to PDF
        result = subprocess.run(
            ["pdflatex", "-output-directory", str(temp_dir), str(tex_file)],
            capture_output=True,
            text=True,
            timeout=30
        )

        if result.returncode != 0 or not pdf_file.exists():
            print(f" ❌ LaTeX compilation failed for {figure_id}")
            return None

        # Convert PDF to PNG using ImageMagick
        result = subprocess.run(
            ["magick", "convert", "-density", "150", str(pdf_file), str(png_file)],
            capture_output=True,
            text=True,
            timeout=15
        )

        if result.returncode != 0 or not png_file.exists():
            print(f" ❌ PDF to PNG conversion failed for {figure_id}")
            return None

        return str(png_file)

    except subprocess.TimeoutExpired:
        print(f" ❌ Compilation timeout for {figure_id}")
        return None
    except Exception as e:
        print(f" ❌ Compilation error for {figure_id}: {e}")
        return None
    finally:
        # Clean up intermediate files; the PNG (if produced) is kept.
        for temp_file in [tex_file, pdf_file, tex_file.with_suffix('.aux'),
                          tex_file.with_suffix('.log')]:
            if temp_file.exists():
                try:
                    temp_file.unlink()
                except OSError:
                    # BUG FIX: was a bare `except:` which also swallowed
                    # KeyboardInterrupt/SystemExit. Only filesystem errors
                    # are expected from unlink().
                    pass
|
||
|
||
def parse_sections(self, content: str) -> List[Dict[str, any]]:
    """Split QMD content into sections keyed by their ## headers.

    Content preceding the first header (if any) is returned as a section
    titled 'Introduction'. Operates purely on text — no line numbers.
    """
    header_re = r'^##\s+([^#\n]+?)(?:\s*\{#[^}]+\}.*)?$'
    pieces = re.split(header_re, content, flags=re.MULTILINE)

    sections: List[Dict[str, any]] = []
    if len(pieces) <= 1:
        # No ## header matched anywhere — nothing to report.
        return sections

    # pieces[0] is whatever came before the first header.
    preamble = pieces[0].strip()
    if preamble:
        sections.append({'title': 'Introduction', 'content': preamble})

    # After splitting on a single capturing group, titles and bodies
    # alternate: pieces[1], pieces[2], pieces[3], pieces[4], ...
    for header, body in zip(pieces[1::2], pieces[2::2]):
        sections.append({
            'title': header.strip(),
            'content': body.strip()
        })

    return sections
|
||
|
||
def load_content_map(self) -> Dict:
    """Load the previously built content map from its JSON file.

    Prints guidance and returns an empty dict when the file is absent;
    read/parse failures are printed, recorded in self.stats['errors'],
    and also yield an empty dict.
    """
    map_path = self.content_map_file
    if not os.path.exists(map_path):
        print(f"❌ Content map not found: {map_path}")
        print(f"💡 Run: python {os.path.basename(__file__)} --build-map")
        return {}

    try:
        with open(map_path, 'r', encoding='utf-8') as fh:
            content_map = json.load(fh)
        print(f"📋 Loaded content map: {len(content_map.get('figures', {}))} figures, {len(content_map.get('tables', {}))} tables")
        return content_map
    except Exception as e:
        error_msg = f"Error loading content map: {e}"
        print(f"❌ {error_msg}")
        self.stats['errors'].append(error_msg)
        return {}
|
||
|
||
def save_content_map(self, content_map: Dict):
    """Persist the content map as UTF-8 JSON, stringifying Path objects.

    Path values (at any nesting depth inside dicts/lists) are converted
    to strings so json can serialize them. Save failures are printed and
    re-raised.
    """

    def _jsonable(value):
        """Recursively convert Path objects to strings for JSON serialization."""
        if isinstance(value, Path):
            return str(value)
        if isinstance(value, dict):
            return {k: _jsonable(v) for k, v in value.items()}
        if isinstance(value, list):
            return [_jsonable(item) for item in value]
        return value

    try:
        serializable_map = _jsonable(content_map)
        with open(self.content_map_file, 'w', encoding='utf-8') as fh:
            json.dump(serializable_map, fh, indent=2, ensure_ascii=False)
        print(f"💾 Content map saved to: {self.content_map_file}")
    except Exception as e:
        print(f"❌ Error saving content map: {e}")
        raise
|
||
|
||
def find_figure_definition_in_qmd(self, content: str, fig_id: str) -> Optional[Dict[str, str]]:
    """Locate a figure definition of any supported flavor.

    Tries the specialized detectors from most to least specific:
    code-generated blocks (```{r}/```{python} with #| label), TikZ div
    blocks, then plain markdown images. R figures surface as type 'code'
    with language='r'.

    Args:
        content: QMD file content.
        fig_id: Full figure ID (e.g., "fig-example").

    Returns:
        The first matching detector's result dict, or None.
    """
    for detect in (self.detect_code_figure,
                   self.detect_tikz_figure,
                   self.detect_markdown_figure):
        found = detect(content, fig_id)
        if found:
            return found
    return None
|
||
|
||
def detect_markdown_figure(self, content: str, fig_id: str) -> Optional[Dict[str, str]]:
    """Detect a standard markdown image figure.

    Required shape: ``![caption](path){#fig-id optional-attributes}``.
    The caption may contain one level of nested brackets (citations),
    and the ID is anchored so "fig-a" never matches "fig-ab".

    Args:
        content: QMD file content.
        fig_id: Full figure ID (e.g., "fig-example").

    Returns:
        Dict with 'type', 'caption', 'path', 'full_match', 'start',
        'end', or None when the figure is not present.
    """
    escaped_fig_id = re.escape(fig_id)
    pattern = rf'!\[((?:[^\[\]]|\[[^\]]*\])*)\]\(([^)]+)\)\s*\{{\s*#{escaped_fig_id}(?=\s|}})[^}}]*\}}'

    hit = re.search(pattern, content, re.MULTILINE)
    if hit is None:
        return None

    return {
        'type': 'markdown',
        'caption': hit.group(1).strip(),
        'path': hit.group(2).strip(),
        'full_match': hit.group(0),
        'start': hit.start(),
        'end': hit.end()
    }
|
||
|
||
def detect_r_figure(self, content: str, fig_id: str) -> Optional[Dict[str, str]]:
    """
    Detect R code block figures with labels and fig-cap.

    Required format:
        ```{r}
        #| label: fig-id
        #| fig-cap: "Caption text"
        #| echo: false (optional)
        ... R code ...
        ```

    The two #| options may appear in either order; fig-cap values may be
    quoted (single or double) or unquoted.

    NOTE(review): this detector is not in find_figure_definition_in_qmd's
    detector list — R blocks appear to be caught there by
    detect_code_figure as type 'code' with language='r'. Presumably kept
    for direct callers; verify before removing.

    Args:
        content: QMD file content
        fig_id: Full figure ID (e.g., "fig-imagenet-challenge")

    Returns:
        Dict with 'type', 'caption', 'r_code', 'full_match', 'start',
        'end', or None if not found
    """
    # Pattern to match R code blocks with specific label and fig-cap
    # Handle both quoted and unquoted fig-cap values
    escaped_fig_id = re.escape(fig_id)

    # Try pattern with label first, then fig-cap (with or without quotes)
    pattern = rf'```\{{r\}}(.*?)#\|\s*label:\s*{escaped_fig_id}\s*\n(.*?)#\|\s*fig-cap:\s*(?:["\']([^"\']*)["\']|([^\n]*))([^`]*?)```'

    match = re.search(pattern, content, re.DOTALL)
    if match:
        # Extract caption from either quoted (group 3) or unquoted (group 4)
        caption = (match.group(3) or match.group(4) or '').strip()
    else:
        # Try alternative pattern where fig-cap comes before label
        pattern = rf'```\{{r\}}(.*?)#\|\s*fig-cap:\s*(?:["\']([^"\']*)["\']|([^\n]*?))\s*\n(.*?)#\|\s*label:\s*{escaped_fig_id}([^`]*?)```'
        match = re.search(pattern, content, re.DOTALL)
        if match:
            # Extract caption from either quoted (group 2) or unquoted (group 3)
            caption = (match.group(2) or match.group(3) or '').strip()
        else:
            return None

    # Extract the full R code: slice off the 7-char "```{r}\n" opener and
    # the 3-char "```" closer.
    r_code = match.group(0)[7:-3]  # Remove ```{r} and ``` wrappers

    return {
        'type': 'r',
        'caption': caption,
        'r_code': r_code.strip(),
        'full_match': match.group(0),
        'start': match.start(),
        'end': match.end()
    }
|
||
|
||
def detect_tikz_figure(self, content: str, fig_id: str) -> Optional[Dict[str, str]]:
    """Detect a TikZ figure rendered inside a fenced div.

    Required structure: a ``::: { ... #fig-id ... }`` div that wraps a
    ```` ```{.tikz} ```` code block, with the caption text sitting between
    the closing fence and the closing ``:::``. Divs that carry the ID but
    contain no tikz block are rejected (they are some other div kind).

    Args:
        content: QMD file content.
        fig_id: Full figure ID (e.g., "fig-neural-network").

    Returns:
        Dict with 'type', 'caption', 'tikz_code', 'full_match', 'start',
        'end', or None when no such div/tikz pair exists.
    """
    div_pattern = rf':::\s*\{{[^}}]*#{re.escape(fig_id)}(?=\s|[}}])[^}}]*\}}\s*\n(.*?):::'
    div_match = re.search(div_pattern, content, re.DOTALL)
    if div_match is None:
        return None

    inner = div_match.group(1)

    # Without a tikz code block this is not a TikZ figure.
    code_match = re.search(r'```\s*\{\.tikz\}\s*(.*?)\s*```', inner, re.DOTALL)
    if code_match is None:
        return None

    # Caption = whatever follows the code block, collapsed to one line.
    trailing = inner[code_match.end():].strip()
    caption = ' '.join(ln.strip() for ln in trailing.split('\n') if ln.strip())

    return {
        'type': 'tikz',
        'caption': caption,
        'tikz_code': code_match.group(1).strip(),
        'full_match': div_match.group(0),
        'start': div_match.start(),
        'end': div_match.end()
    }
|
||
|
||
def detect_code_figure(self, content: str, fig_id: str) -> Optional[Dict[str, str]]:
    """Detect a code-generated figure (R or Python fenced block).

    Matches a ```{r}/```{python} block whose ``#| label:`` equals fig_id
    and pulls the caption from its ``#| fig-cap:`` line. R blocks use
    YAML-aware caption extraction (quoted values that begin with ** need
    special handling via extract_caption_from_yaml_value); other
    languages use a plain quoted/unquoted pattern.

    Args:
        content: QMD file content.
        fig_id: Full figure ID (e.g., "fig-datacenter-energy").

    Returns:
        Dict with 'type', 'caption', 'language', 'code', 'full_match',
        'start', 'end', or None when not found.
    """
    block_pattern = rf'```\{{(r|python)[^}}]*\}}([^`]*?#\|\s*label:\s*{re.escape(fig_id)}[^`]*?)```'
    block_match = re.search(block_pattern, content, re.DOTALL | re.MULTILINE)
    if block_match is None:
        return None

    language = block_match.group(1)
    code_block = block_match.group(2)

    if language == 'r':
        # YAML-aware path: take the raw value and let the helper strip
        # the quoting so captions starting with ** survive intact.
        cap_match = re.search(r'#\|\s*fig-cap:\s*(.+)$', code_block, re.MULTILINE)
        if cap_match:
            caption = self.extract_caption_from_yaml_value(cap_match.group(1).strip())
        else:
            caption = ""
    else:
        # Standard extraction: group 1 quoted, group 2 unquoted.
        cap_match = re.search(r'#\|\s*fig-cap:\s*(?:"([^"]*)"|(.+))$', code_block, re.MULTILINE)
        if cap_match:
            caption = (cap_match.group(1) or cap_match.group(2) or '').strip()
        else:
            caption = ""

    return {
        'type': 'code',
        'caption': caption,
        'language': language,
        'code': code_block.strip(),
        'full_match': block_match.group(0),
        'start': block_match.start(),
        'end': block_match.end()
    }
|
||
|
||
def detect_table(self, content: str, tbl_id: str) -> Optional[Dict[str, str]]:
    """Detect a table caption line for tbl_id.

    Two accepted formats:
      - old: ``: Caption text {#tbl-id}``
      - new: ``Caption text {#tbl-id}``
    The old format is tried first so the leading ``: `` is stripped from
    the returned caption.

    Args:
        content: QMD file content.
        tbl_id: Full table ID (e.g., "tbl-models").

    Returns:
        Dict with 'type', 'caption', 'full_match', 'start', 'end', or
        None when no caption line is found.
    """
    escaped = re.escape(tbl_id)
    old_style = rf'^:\s*([^{{\n]+?)\s*\{{[^}}]*#{escaped}(?:\s|[^}}])*\}}\s*$'
    new_style = rf'^([^{{\n]+?)\s*\{{[^}}]*#{escaped}(?:\s|[^}}])*\}}\s*$'

    hit = re.search(old_style, content, re.MULTILINE)
    if hit is None:
        # Colon-less new format; allows colons inside the caption text.
        hit = re.search(new_style, content, re.MULTILINE)
    if hit is None:
        return None

    return {
        'type': 'table',
        'caption': hit.group(1).strip(),
        'full_match': hit.group(0),
        'start': hit.start(),
        'end': hit.end()
    }
|
||
|
||
def find_table_definition_in_qmd(self, content: str, tbl_id: str) -> Optional[Dict[str, str]]:
    """Thin wrapper delegating table lookup to detect_table.

    Args:
        content: QMD file content.
        tbl_id: Full table ID (e.g., "tbl-models").

    Returns:
        detect_table's result dict, or None if no caption line matches.
    """
    result = self.detect_table(content, tbl_id)
    return result
# ================================================================
# SPECIALIZED UPDATE FUNCTIONS FOR DIFFERENT FIGURE/TABLE TYPES
# ================================================================
def update_markdown_figure(self, content: str, fig_id: str, new_caption: str) -> str:
    """Swap the caption text of a markdown image figure in place.

    Rewrites ``![old](path){#fig-id ...}`` to carry new_caption; the
    path and the attribute block are preserved untouched. The lazy
    caption match copes with nested brackets (citations).

    Args:
        content: QMD file content.
        fig_id: Full figure ID (e.g., "fig-ai-timeline").
        new_caption: New caption text.

    Returns:
        Updated content (unchanged if no matching figure exists).
    """
    img_pattern = rf'(!\[).*?(\]\([^)]+\)\s*\{{[^}}]*#{re.escape(fig_id)}(?:\s|[^}}])*\}})'
    substitution = rf'\g<1>{new_caption}\g<2>'
    return re.sub(img_pattern, substitution, content, flags=re.MULTILINE | re.DOTALL)
|
||
|
||
def update_tikz_figure(self, content: str, fig_id: str, new_caption: str) -> str:
    """Update the caption of a TikZ/Div figure.

    The caption lives between the closing ``` of the tikz block and the
    closing :::. Two passes are attempted:
      1. Replace an existing non-empty caption after the code fence.
      2. If pass 1 changed nothing (e.g. the div currently has no
         caption), insert the caption just before :::.

    Args:
        content: QMD file content.
        fig_id: Full figure ID (e.g., "fig-neural-network").
        new_caption: New caption text.

    Returns:
        Updated content (unchanged if the figure was not found).
    """
    # Method 1: replace an existing caption after the tikz block.
    # Flexible: fig-id can be anywhere in the div's attribute list.
    pattern = rf'(:::\s*\{{[^}}]*#{re.escape(fig_id)}(?:\s|[^}}])*\}}.*?```\s*\n\s*)([^:]+?)((?:\s*:::))'

    def replace_caption(match):
        before = match.group(1)
        after = match.group(3)
        return f"{before}{new_caption}{after}"

    updated_content = re.sub(pattern, replace_caption, content, flags=re.MULTILINE | re.DOTALL)

    # Method 2: nothing changed (e.g. empty caption) — insert before :::.
    if updated_content == content:
        # BUG FIX: the closing-fence group was written as `(:::` (an
        # unbalanced parenthesis), so compiling this pattern raised
        # re.error every time method 1 failed to match. It must be `(:::)`
        # for the group(3) reference below to exist.
        div_pattern = rf'(:::\s*\{{[^}}]*#{re.escape(fig_id)}(?:\s|[^}}])*\}}.*?```\s*\n)([^:]*?)(:::)'

        def replace_div_caption(match):
            before = match.group(1)
            after = match.group(3)
            return f"{before}{new_caption}\n{after}"

        updated_content = re.sub(div_pattern, replace_div_caption, content, flags=re.MULTILINE | re.DOTALL)

    return updated_content
|
||
|
||
def update_code_figure(self, content: str, fig_id: str, new_caption: str) -> str:
    """Rewrite the ``#| fig-cap:`` line of the code block labeled fig_id.

    R blocks route the caption through ensure_yaml_safe_caption (captions
    beginning with ** must be quoted for YAML); all other languages are
    simply wrapped in double quotes.

    Args:
        content: QMD file content.
        fig_id: Full figure ID (e.g., "fig-datacenter-energy").
        new_caption: New caption text.

    Returns:
        Updated content (unchanged if no matching block exists).
    """
    cap_line_pattern = rf'(```\{{(r|python)[^}}]*\}}[^`]*?#\|\s*label:\s*{re.escape(fig_id)}[^`]*?#\|\s*fig-cap:\s*)([^\n]+)'

    def rewrite(match):
        prefix = match.group(1)
        language = match.group(2)
        if language == 'r':
            return f'{prefix}{self.ensure_yaml_safe_caption(new_caption)}'
        return f'{prefix}"{new_caption}"'

    return re.sub(cap_line_pattern, rewrite, content, flags=re.MULTILINE | re.DOTALL)
|
||
|
||
def update_table_caption(self, content: str, tbl_id: str, new_caption: str) -> str:
    """Rewrite a table caption to the canonical ``: caption. {#tbl-id}`` form.

    The existing caption line is matched with or without its leading
    colon; a ``: `` prefix and a trailing period are added to
    new_caption unless it already carries them. Any attributes inside
    the ``{#tbl-id ...}`` block are preserved.

    Args:
        content: QMD file content.
        tbl_id: Full table ID (e.g., "tbl-models").
        new_caption: New caption text (may already carry the ``: `` prefix).

    Returns:
        Updated content (unchanged if no caption line matches).
    """
    line_pattern = rf'^:?\s*([^{{\n]+?)(\s*\{{[^}}]*#{re.escape(tbl_id)}(?:\s|[^}}])*\}})\s*$'

    def rebuild(match):
        attributes = match.group(2).strip()  # keep the {#tbl-id ...} part
        if new_caption.startswith(': '):
            # Already prefixed — use verbatim to avoid ":: " doubling.
            caption_text = new_caption
        elif new_caption.endswith('.'):
            caption_text = f': {new_caption}'
        else:
            caption_text = f': {new_caption}.'
        return f'{caption_text} {attributes}'

    return re.sub(line_pattern, rebuild, content, flags=re.MULTILINE)
|
||
|
||
def update_table_caption_in_qmd(self, content: str, tbl_id: str, new_caption: str) -> str:
    """Thin wrapper delegating to the specialized update_table_caption.

    Args:
        content: QMD file content.
        tbl_id: Full table ID (e.g., "tbl-models").
        new_caption: New caption text.

    Returns:
        Updated content.
    """
    updated = self.update_table_caption(content, tbl_id, new_caption)
    return updated
# ================================================================
# UNIFIED UPDATE FUNCTION
# ================================================================
def update_figure_caption_in_qmd(self, content: str, fig_id: str, new_caption: str) -> str:
    """
    Unified figure caption update across all supported formats.

    Looks up the figure's definition to learn its format, then routes
    the update to the matching format-specific updater.  Unknown or
    unrecognized formats fall back to the markdown updater.

    Args:
        content: QMD file content.
        fig_id: Full figure ID (e.g., "fig-example").
        new_caption: Replacement caption text.

    Returns:
        Updated content (unchanged when the figure cannot be found).
    """
    definition = self.find_figure_definition_in_qmd(content, fig_id)
    if not definition:
        return content

    # Dispatch table keyed on the detected figure format; 'markdown'
    # doubles as the fallback for anything unrecognized.
    updaters = {
        'markdown': self.update_markdown_figure,
        'tikz': self.update_tikz_figure,
        'code': self.update_code_figure,
    }
    updater = updaters.get(definition['type'], self.update_markdown_figure)
    return updater(content, fig_id, new_caption)
|
||
|
||
def print_summary(self) -> None:
    """Print a summary of the processing results.

    Reads the counters accumulated in ``self.stats`` during a run
    (files, figures, tables, images, JSON parses, TikZ compiles, and
    collected error messages) and writes a human-readable report to
    stdout.  Purely informational; mutates nothing.
    """
    print(f"\n{'='*60}")
    print(f"📊 CAPTION IMPROVEMENT SUMMARY")
    print(f"{'='*60}")

    print(f"Files processed: {self.stats['files_processed']}")
    print(f"Figures found: {self.stats['figures_found']}")
    print(f"Figures improved: {self.stats['figures_improved']} ✅")
    print(f"Tables found: {self.stats['tables_found']}")
    print(f"Tables improved: {self.stats['tables_improved']} ✅")
    print(f"Images found: {self.stats['images_found']} 🖼️")
    print(f"Images missing: {self.stats['images_missing']} ⚠️")
    print(f"JSON success: {self.stats['json_success']} 📋")
    print(f"JSON failed: {self.stats['json_failed']} 🚫")

    # Improvement rate is relative to figures only (tables excluded).
    if self.stats['figures_improved'] > 0:
        improvement_rate = (self.stats['figures_improved'] / self.stats['figures_found']) * 100
        print(f"Improvement rate: {improvement_rate:.1f}%")

    if self.stats['errors']:
        print(f"\n⚠️  Issues encountered ({len(self.stats['errors'])}):")
        for i, error in enumerate(self.stats['errors'], 1):
            print(f"  {i}. {error}")

    # Hint for missing images: vision-based captioning degrades to
    # text-only when image paths cannot be resolved.
    if self.stats['images_missing'] > 0:
        print(f"\n💡 Tip: {self.stats['images_missing']} images were not found.")
        print(f"   Consider checking image paths or using text-only processing.")

    # TikZ figures require an external toolchain; surface failures here.
    if self.stats['tikz_found'] > 0:
        print(f"\n🔧 TikZ Processing: Found {self.stats['tikz_found']} TikZ figures.")
        if self.stats['tikz_failed'] > 0:
            print(f"   ⚠️ {self.stats['tikz_failed']} TikZ compilations failed.")
            print(f"   Ensure you have LaTeX (pdflatex) and ImageMagick (magick) installed.")

    print(f"{'='*60}")
|
||
|
||
|
||
|
||
def validate_qmd_mapping(self, directories: List[str], content_map: Dict) -> Dict:
    """Scan QMD files and validate mapping for all figures/tables.

    Cross-checks every figure/table ID in ``content_map`` against the
    QMD sources, records where each one was found, annotates the map
    in place with 'source_file' and 'qmd_caption', and prints a report
    listing any IDs that could not be located.

    Args:
        directories: Directories whose QMD files should be scanned
            (in book order, per _quarto.yml).
        content_map: Previously built map with 'figures'/'tables' keys.

    Returns:
        The same content_map object, annotated in place.  Returned
        unmodified when the commented-chapter check requests a halt.
    """
    print(f"🔍 Validating QMD mapping...")

    # Check for commented chapters in target directories first; halt
    # early (returning the map untouched) if issues require attention.
    commented_issues = self.check_commented_chapters_in_directories(directories)
    should_halt = self.print_commented_chapter_issues(commented_issues)
    if should_halt:
        return content_map

    qmd_files = self.find_qmd_files_in_order(directories)
    print(f"📁 Scanning {len(qmd_files)} QMD files")

    # Track what we find in QMD files.
    found_figures = {}
    found_tables = {}
    missing_figures = []
    missing_tables = []

    # Scan all QMD files for figure/table references.  First hit wins:
    # once an ID is found in some file, later files are not checked.
    for qmd_file in qmd_files:
        try:
            with open(qmd_file, 'r', encoding='utf-8') as f:
                content = f.read()

            print(f"  📄 Scanning: {qmd_file}")

            # Check each figure from content map.
            for fig_id in content_map.get('figures', {}):
                if fig_id not in found_figures:
                    fig_def = self.find_figure_definition_in_qmd(content, fig_id)
                    if fig_def:
                        found_figures[fig_id] = {
                            'qmd_file': qmd_file,
                            'current_caption': fig_def['caption'],
                            'definition': fig_def
                        }
                        print(f"    ✅ Found figure: {fig_id}")

            # Check each table from content map.
            for tbl_id in content_map.get('tables', {}):
                if tbl_id not in found_tables:
                    tbl_def = self.find_table_definition_in_qmd(content, tbl_id)
                    if tbl_def:
                        found_tables[tbl_id] = {
                            'qmd_file': qmd_file,
                            'current_caption': tbl_def['caption'],
                            'definition': tbl_def
                        }
                        print(f"    ✅ Found table: {tbl_id}")

        except Exception as e:
            # Keep scanning the remaining files even if one fails to read.
            print(f"  ❌ Error scanning {qmd_file}: {e}")

    # Identify missing items (present in the map, absent from all QMDs).
    for fig_id in content_map.get('figures', {}):
        if fig_id not in found_figures:
            missing_figures.append(fig_id)

    for tbl_id in content_map.get('tables', {}):
        if tbl_id not in found_tables:
            missing_tables.append(tbl_id)

    # Update content map with QMD locations (in-place annotation).
    for fig_id, fig_info in found_figures.items():
        content_map['figures'][fig_id]['source_file'] = fig_info['qmd_file']
        content_map['figures'][fig_id]['qmd_caption'] = fig_info['current_caption']

    for tbl_id, tbl_info in found_tables.items():
        content_map['tables'][tbl_id]['source_file'] = tbl_info['qmd_file']
        content_map['tables'][tbl_id]['qmd_caption'] = tbl_info['current_caption']

    # Report validation results.
    print(f"\n📊 VALIDATION SUMMARY:")
    print(f"   Figures: {len(found_figures)}/{len(content_map.get('figures', {}))} found in QMD")
    print(f"   Tables: {len(found_tables)}/{len(content_map.get('tables', {}))} found in QMD")

    if missing_figures:
        print(f"\n❌ Missing figures ({len(missing_figures)}):")
        for fig_id in missing_figures:
            print(f"   - {fig_id}")

    if missing_tables:
        print(f"\n❌ Missing tables ({len(missing_tables)}):")
        for tbl_id in missing_tables:
            print(f"   - {tbl_id}")

    if not missing_figures and not missing_tables:
        print(f"\n✅ Perfect mapping! All items found in QMD files.")

    # Commented-chapter check already performed at start of validation.

    return content_map
|
||
|
||
def check_caption_quality(self, directories: List[str], figures_only: bool = False, tables_only: bool = False, listings_only: bool = False, content_map: Optional[Dict] = None) -> Dict[str, any]:
    """
    Analyze all captions and generate a quality report.

    Args:
        directories: List of directories to scan.
        figures_only: If True, only analyze figures (ignore tables and listings).
        tables_only: If True, only analyze tables (ignore figures and listings).
        listings_only: If True, only analyze listings (ignore figures and tables).
        content_map: Optional pre-built content map to avoid rebuilding.

    Returns:
        Dict with quality analysis results: total/needing-repair counts,
        per-issue-type tallies, and a detailed issue list.  Empty dict on
        early halt or if the content map could not be built.
    """
    print("🔍 Analyzing caption quality...")

    # Check for commented chapters in target directories first.
    commented_issues = self.check_commented_chapters_in_directories(directories)
    should_halt = self.print_commented_chapter_issues(commented_issues)
    if should_halt:
        return {}

    # Use provided content map or build a fresh one with filtering.
    if content_map is None:
        content_map = self.build_content_map_from_qmd(directories, figures_only=figures_only, tables_only=tables_only, listings_only=listings_only)
        if not content_map:
            print("❌ Failed to build content map for analysis.")
            return {}

    # NOTE: the previous version also called find_qmd_files_in_order()
    # here but never used the result; that dead call has been removed.

    report = {
        'total_captions': 0,
        'captions_needing_repair': 0,
        'issues_by_type': {},
        'detailed_issues': []
    }

    def _analyze(items: Dict, item_type: str) -> None:
        # Shared per-item analysis loop (previously duplicated verbatim
        # for figures and tables).  Mutates `report` in place.
        for item_id, item_data in items.items():
            current_caption = item_data.get('current_caption', '')
            analysis = self.quality_checker.analyze_caption(current_caption)

            report['total_captions'] += 1
            if analysis['needs_repair']:
                report['captions_needing_repair'] += 1
                report['detailed_issues'].append({
                    'id': item_id,
                    'type': item_type,
                    'current_caption': current_caption,
                    'issues': analysis['issues']
                })

                # Count issues by type.
                for issue in analysis['issues']:
                    issue_type = issue['type']
                    report['issues_by_type'][issue_type] = report['issues_by_type'].get(issue_type, 0) + 1

    _analyze(content_map.get('figures', {}), 'figure')
    _analyze(content_map.get('tables', {}), 'table')

    return report
|
||
|
||
def print_quality_report(self, report: Dict[str, any]):
    """Render the caption quality report to stdout as a summary plus
    an ASCII table of up to the first 20 detailed issues."""
    if not report:
        return

    caption_count = report['total_captions']
    repair_count = report['captions_needing_repair']
    repair_pct = (repair_count / caption_count * 100) if caption_count > 0 else 0

    print(f"\n📊 Caption Quality Report:")
    print(f"={'=' * 60}")
    print(f"Total captions: {caption_count}")
    print(f"Need repair: {repair_count} ({repair_pct:.1f}%)")

    if report['issues_by_type']:
        print(f"\n🔍 Issues by type:")
        for kind, tally in report['issues_by_type'].items():
            print(f"  • {kind.replace('_', ' ').title()}: {tally}")

    detailed = report['detailed_issues']
    if detailed:
        print(f"\n📝 Detailed Issues:")
        print(f"┌─{'─' * 18}─┬─{'─' * 12}─┬─{'─' * 35}─┐")
        print(f"│ {'ID':<18} │ {'Issue':<12} │ {'Current Caption':<35} │")
        print(f"├─{'─' * 18}─┼─{'─' * 12}─┼─{'─' * 35}─┤")

        # Truncate every column to its fixed width; show at most 20 rows.
        for entry in detailed[:20]:
            row_id = entry['id'][:18]
            row_caption = entry['current_caption'][:35]
            row_issues = ', '.join(issue['description'] for issue in entry['issues'])[:12]
            print(f"│ {row_id:<18} │ {row_issues:<12} │ {row_caption:<35} │")

        print(f"└─{'─' * 18}─┴─{'─' * 12}─┴─{'─' * 35}─┘")

        overflow = len(detailed) - 20
        if overflow > 0:
            print(f"... and {overflow} more issues")

    if repair_count > 0:
        print(f"\n💡 To fix these issues, run:")
        print(f"   python {__file__} --repair -d {' -d '.join(['contents/vol1/'])}")
    else:
        print(f"\n✅ All captions look good!")
|
||
|
||
def repair_captions(self, directories: List[str], figures_only: bool = False, tables_only: bool = False, listings_only: bool = False, specific_files: Optional[List[str]] = None):
    """
    Repair only captions that need fixing.

    Builds a content map, runs the quality analysis over it, previews
    every item that needs work, then applies one of two fix paths:
    basic normalization (punctuation/case) or LLM regeneration (for
    captions missing the ``**Bold**:`` pattern).  Changed captions are
    saved to content_map.json and written back to the QMD files.

    Args:
        directories: List of directories to scan.
        figures_only: If True, only repair figures (ignore tables and listings).
        tables_only: If True, only repair tables (ignore figures and listings).
        listings_only: If True, only repair listings (ignore figures and tables).
        specific_files: Optional list of specific QMD files to process
            instead of scanning the directories.

    Returns:
        The content map ({} on early halt; map returned early when no
        repairs are needed; None after a full repair pass).
    """
    print("🔧 Repairing captions that need fixing...")

    # Check for commented chapters first; halt if they need attention.
    commented_issues = self.check_commented_chapters_in_directories(directories)
    should_halt = self.print_commented_chapter_issues(commented_issues)
    if should_halt:
        return {}

    # Build content map with filtering.
    content_map = self.build_content_map_from_qmd(directories, figures_only=figures_only, tables_only=tables_only, listings_only=listings_only, specific_files=specific_files)

    # Check caption quality once using the existing content map
    # (avoids a second, redundant map build inside the checker).
    quality_report = self.check_caption_quality(directories, figures_only=figures_only, tables_only=tables_only, listings_only=listings_only, content_map=content_map)

    # Create repair list - only items that need fixing.
    repair_items = []
    for issue_item in quality_report['detailed_issues']:
        item_id = issue_item['id']
        item_type = issue_item['type']

        if item_type == 'figure' and item_id in content_map.get('figures', {}):
            repair_items.append(('figure', item_id, content_map['figures'][item_id]))
        elif item_type == 'table' and item_id in content_map.get('tables', {}):
            repair_items.append(('table', item_id, content_map['tables'][item_id]))

    # If no repairs needed, exit early.
    if not repair_items:
        print("✅ No captions need repair!")
        return content_map

    # Separate items by fix type and show a preview of planned work.
    basic_fixes = []
    llm_fixes = []

    print(f"\n📋 Found {len(repair_items)} captions that need repair:")
    print("=" * 60)

    for item_type, item_id, item_data in repair_items:
        # Look up this item's issues in the existing quality report.
        issues = []
        issue_descriptions = []
        for issue_item in quality_report['detailed_issues']:
            if issue_item['id'] == item_id:
                issues = [issue['type'] for issue in issue_item['issues']]
                issue_descriptions = [issue['description'] for issue in issue_item['issues']]
                break

        # Determine fix type and icon: a missing **Bold**: pattern needs
        # an LLM rewrite; everything else is mechanical normalization.
        if 'missing_bold_pattern' in issues:
            llm_fixes.append((item_type, item_id, item_data))
            icon = "🤖"  # LLM fix needed
            fix_type = "LLM regeneration"
        else:
            basic_fixes.append((item_type, item_id, item_data))
            icon = "🔧"  # Basic fix
            fix_type = "Basic formatting"

        # Show preview (caption truncated to 50 chars for readability).
        type_icon = "📊" if item_type == "figure" else "📋"
        source_file = item_data.get('source_file', 'unknown')
        current_caption = item_data.get('current_caption', '')[:50] + "..." if len(item_data.get('current_caption', '')) > 50 else item_data.get('current_caption', '')

        print(f"{icon} {type_icon} {item_id}")
        print(f"   File: {source_file}")
        print(f"   Current: {current_caption}")
        print(f"   Issues: {', '.join(issue_descriptions)}")
        print(f"   Fix: {fix_type}")
        print()

    print(f"🔧 Basic fixes: {len(basic_fixes)} items")
    print(f"🤖 LLM fixes: {len(llm_fixes)} items")
    print("=" * 60)
    print("🚀 Starting repairs...\n")

    fixed_count = 0

    # Apply basic normalization fixes (no LLM involved).
    for item_type, item_id, item_data in basic_fixes:
        print(f"🔧 Processing {item_id}...", end="")
        current_caption = item_data.get('current_caption', '')

        # Apply normalization fixes.
        fixed_caption = self.normalize_caption_punctuation(current_caption)
        fixed_caption = self.normalize_caption_case(fixed_caption)

        # Only update if it actually changed.
        if fixed_caption != current_caption:
            item_data['new_caption'] = fixed_caption
            item_data['current_caption'] = fixed_caption  # Update current_caption too
            print(f" ✅")
            fixed_count += 1
        else:
            print(f" 🔄 (no change needed)")

    # Apply LLM fixes for missing **Bold**: format.
    if llm_fixes:
        print(f"🤖 Generating properly formatted captions for {len(llm_fixes)} items...")
        llm_fixed = self._apply_llm_repairs(llm_fixes, content_map)
        fixed_count += llm_fixed

    if fixed_count > 0:
        # Save updated content map.
        self.save_content_map(content_map)

        # Update QMD files on disk with the new captions.
        self.process_qmd_files(directories, content_map)

        # Show summary of what was changed.
        print("\n" + "=" * 60)
        print(f"✅ Successfully repaired {fixed_count} captions!")
        print(f"🔧 Basic fixes: {len([f for f in basic_fixes if f[2].get('new_caption')])}")
        print(f"🤖 LLM fixes: {len([f for f in llm_fixes if f[2].get('new_caption')])}")
        print("💾 Content map saved to content_map.json")
        print("📝 Check git diff to see what changed in the QMD files")
        print("=" * 60)
    else:
        print("\n✅ No automatic repairs needed!")
|
||
|
||
def _apply_llm_repairs(self, llm_fixes, content_map):
    """Apply LLM-based repairs for captions missing **Bold**: format.

    For each (item_type, item_id, item_data) tuple, re-reads the source
    QMD file, extracts the surrounding section for context, and asks the
    Ollama model for a regenerated caption.  Successful regenerations
    are written into ``item_data['new_caption']`` / ``['current_caption']``
    (which alias entries inside ``content_map``).

    Args:
        llm_fixes: List of (item_type, item_id, item_data) tuples.
        content_map: Full content map; item_data entries reference into it.

    Returns:
        int: Number of captions that were actually improved.
    """
    fixed_count = 0

    for item_type, item_id, item_data in llm_fixes:
        print(f"🤖 Generating caption for {item_id}...", end="")
        try:
            source_file = item_data.get('source_file')
            if not source_file:
                print(f" ❌ (no source file)")
                continue

            # Read file content for context extraction.
            with open(source_file, 'r', encoding='utf-8') as f:
                file_content = f.read()

            # Extract context around this item.
            context = self.extract_section_context(file_content, item_id)
            current_caption = item_data.get('current_caption', '')

            # Generate improved caption with parameters appropriate to
            # the item type (figures may pass an image to the model).
            if item_type == 'figure':
                # For markdown figures, resolve the referenced image path
                # relative to the QMD file so the model can see it.
                image_path = None
                if item_data.get('type') == 'markdown':
                    image_pattern = rf'!\[[^\]]*\]\(([^)]+)\)[^{{]*{{[^}}]*#{re.escape(item_id)}'
                    match = re.search(image_pattern, file_content)
                    if match:
                        relative_path = match.group(1)
                        source_dir = Path(source_file).parent
                        image_path = str(source_dir / relative_path)
                        if not os.path.exists(image_path):
                            # Fall back to text-only when the file is absent.
                            image_path = None

                new_caption = self.generate_caption_with_ollama(
                    context['title'], context['content'], item_id, current_caption,
                    image_path, is_table=False
                )

            elif item_type == 'table':
                new_caption = self.generate_caption_with_ollama(
                    context['title'], context['content'], item_id, current_caption,
                    None, is_table=True
                )

            elif item_type == 'listing':
                # Get the fenced code block's content for listings so the
                # model can describe the actual code.
                code_content = ""
                lst_def = self.find_listing_definition_in_qmd(file_content, item_id)
                if lst_def:
                    code_pattern = r'```[^`]*```'
                    code_match = re.search(code_pattern, lst_def['full_text'], re.DOTALL)
                    if code_match:
                        code_content = code_match.group(0)

                new_caption = self.generate_caption_with_ollama(
                    context['title'], context['content'], item_id, current_caption,
                    None, is_table=False, is_listing=True, code_content=code_content
                )
            else:
                # Unknown item type: nothing to do.
                continue

            # Only count it as fixed when the model produced something new.
            if new_caption and new_caption != current_caption:
                item_data['new_caption'] = new_caption
                item_data['current_caption'] = new_caption
                word_count = len(new_caption.split())
                print(f" ✅ ({word_count} words)")
                fixed_count += 1
            else:
                print(f" 🔄 (no improvement)")

        except Exception as e:
            # Keep going on per-item failures; truncate long messages.
            print(f" ❌ (error: {str(e)[:50]}...)")

    return fixed_count
|
||
|
||
# ================================================================
|
||
# QMD-FOCUSED CONTENT MAP BUILDING
|
||
# ================================================================
|
||
|
||
def build_content_map_from_qmd(self, directories: List[str], figures_only: bool = False, tables_only: bool = False, listings_only: bool = False, specific_files: Optional[List[str]] = None) -> Dict:
    """
    Build comprehensive content map by scanning QMD files directly.

    This QMD-focused approach:
    1. Scans all .qmd files in specified directories
    2. Uses specialized detection functions for each format type
    3. Extracts current captions and metadata
    4. Stores everything in a clean JSON structure
    5. Independent of .tex builds or rendering

    Args:
        directories: List of directories to scan for .qmd files
        figures_only: If True, only process figures (ignore tables and listings)
        tables_only: If True, only process tables (ignore figures and listings)
        listings_only: If True, only process listings (ignore figures and tables)
        specific_files: List of specific files to include (optional)

    Returns:
        Dict with figures, tables, listings, metadata, and extraction
        stats.  Empty dict when halted due to commented-out chapters.
    """
    print(f"📄 Building content map from QMD files...")

    # Check for commented chapters first (only if processing directories, not specific files)
    if not specific_files and directories:
        commented_issues = self.check_commented_chapters_in_directories(directories)
        should_halt = self.print_commented_chapter_issues(commented_issues)
        if should_halt:
            return {}

    # Get QMD files - use specific files if provided, otherwise scan directories
    if specific_files:
        qmd_files = [Path(f) for f in specific_files]
        print(f"📖 Processing {len(qmd_files)} specific QMD files")
    else:
        qmd_files = self.find_qmd_files_in_order(directories)
        print(f"📖 Scanning {len(qmd_files)} QMD files in book order")

    content_map = {
        'figures': {},
        'tables': {},
        'listings': {},
        'metadata': {
            'creation_time': datetime.now().isoformat(),
            'source': 'qmd_direct_scan',
            'directories': directories,
            'qmd_files_scanned': len(qmd_files),
            'extraction_stats': {
                'figures_found': 0,
                'tables_found': 0,
                'listings_found': 0,
                'markdown_figures': 0,
                'tikz_figures': 0,
                'r_figures': 0,
                'code_figures': 0,
                'extraction_failures': 0,
                'failed_extractions': [],
                'files_with_issues': []
            }
        }
    }

    # Alias into the nested stats dict; mutations below update the map.
    stats = content_map['metadata']['extraction_stats']

    for qmd_file in qmd_files:
        try:
            print(f"  📄 Scanning: {qmd_file}")

            with open(qmd_file, 'r', encoding='utf-8') as f:
                content = f.read()

            file_figures = 0
            file_tables = 0
            file_listings = 0

            # Find all potential figure IDs in the content using regex.
            # Pattern matches both {#fig-id} and #| label: fig-id formats.
            fig_id_pattern = r'(?:#\|\s*label:\s*(fig-[a-zA-Z0-9_-]+)|#(fig-[a-zA-Z0-9_-]+))'
            matches = re.findall(fig_id_pattern, content)
            # Extract non-empty matches (either group 1 or group 2).
            potential_fig_ids = set()
            for match in matches:
                fig_id = match[0] or match[1]  # Get non-empty group
                if fig_id:
                    potential_fig_ids.add(fig_id)

            # Find all potential table IDs.
            tbl_id_pattern = r'#(tbl-[a-zA-Z0-9_-]+)'
            potential_tbl_ids = set(re.findall(tbl_id_pattern, content))

            # Find all potential listing IDs.
            lst_id_pattern = r'#(lst-[a-zA-Z0-9_-]+)'
            potential_lst_ids = set(re.findall(lst_id_pattern, content))

            # Process each potential figure ID (silently skipped in
            # tables-only / listings-only modes).
            if not tables_only and not listings_only:
                for fig_id in potential_fig_ids:
                    try:
                        fig_def = self.find_figure_definition_in_qmd(content, fig_id)
                        if fig_def:
                            # Store original caption as-is from the file
                            original_caption = fig_def['caption']

                            content_map['figures'][fig_id] = {
                                'original_caption': original_caption,
                                'current_caption': original_caption,  # Set current_caption for quality analysis
                                'new_caption': '',
                                'type': fig_def['type'],
                                'source_file': qmd_file
                            }

                            print(f"    ✅ Found figure: {fig_id} ({fig_def['type']})")
                            file_figures += 1
                            stats['figures_found'] += 1

                            # Count by type
                            if fig_def['type'] == 'markdown':
                                stats['markdown_figures'] += 1
                            elif fig_def['type'] == 'tikz':
                                stats['tikz_figures'] += 1
                            elif fig_def['type'] == 'r':
                                stats['r_figures'] += 1
                            elif fig_def['type'] == 'code':
                                # Distinguish R figures from other code figures
                                if fig_def.get('language') == 'r':
                                    stats['r_figures'] += 1
                                else:
                                    stats['code_figures'] += 1

                        else:
                            print(f"    ⚠️ Failed to extract: {fig_id}")
                            stats['extraction_failures'] += 1
                            stats['failed_extractions'].append(fig_id)
                            if qmd_file not in stats['files_with_issues']:
                                stats['files_with_issues'].append(qmd_file)

                    except Exception as e:
                        print(f"    ❌ Error processing {fig_id}: {e}")
                        stats['extraction_failures'] += 1
                        stats['failed_extractions'].append(fig_id)
                        if qmd_file not in stats['files_with_issues']:
                            stats['files_with_issues'].append(qmd_file)

            # Process each potential table ID (silently skipped in
            # figures-only / listings-only modes).
            if not figures_only and not listings_only:
                for tbl_id in potential_tbl_ids:
                    try:
                        tbl_def = self.detect_table(content, tbl_id)
                        if tbl_def:
                            # Store original caption as-is from the file
                            original_caption = tbl_def['caption']

                            content_map['tables'][tbl_id] = {
                                'original_caption': original_caption,
                                'current_caption': original_caption,  # Set current_caption for quality analysis
                                'new_caption': '',
                                'type': 'table',
                                'source_file': qmd_file
                            }

                            print(f"    ✅ Found table: {tbl_id}")
                            file_tables += 1
                            stats['tables_found'] += 1

                        else:
                            print(f"    ⚠️ Failed to extract: {tbl_id}")
                            stats['extraction_failures'] += 1
                            stats['failed_extractions'].append(tbl_id)
                            if qmd_file not in stats['files_with_issues']:
                                stats['files_with_issues'].append(qmd_file)

                    except Exception as e:
                        print(f"    ❌ Error processing {tbl_id}: {e}")
                        stats['extraction_failures'] += 1
                        stats['failed_extractions'].append(tbl_id)
                        if qmd_file not in stats['files_with_issues']:
                            stats['files_with_issues'].append(qmd_file)

            # Process each potential listing ID (unless figures-only or
            # tables-only mode, or if explicitly listings-only).
            if (not figures_only and not tables_only) or listings_only:
                for lst_id in potential_lst_ids:
                    try:
                        lst_def = self.find_listing_definition_enhanced(content, lst_id)
                        if lst_def:
                            # Store original caption as-is from the file
                            original_caption = lst_def['caption']

                            content_map['listings'][lst_id] = {
                                'original_caption': original_caption,
                                'current_caption': original_caption,  # Set current_caption for quality analysis
                                'new_caption': '',
                                'type': 'listing',
                                'language': lst_def.get('language', ''),
                                'source_file': qmd_file,
                                'instances': lst_def.get('instances', []),  # Store for atomic updates
                                'consistency_warnings': lst_def.get('consistency_warnings', [])
                            }

                            print(f"    ✅ Found listing: {lst_id} ({lst_def.get('language', 'unknown')})")
                            file_listings += 1
                            stats['listings_found'] += 1

                        else:
                            print(f"    ⚠️ Failed to extract: {lst_id}")
                            stats['extraction_failures'] += 1
                            stats['failed_extractions'].append(lst_id)
                            if qmd_file not in stats['files_with_issues']:
                                stats['files_with_issues'].append(qmd_file)

                    except Exception as e:
                        print(f"    ❌ Error processing {lst_id}: {e}")
                        stats['extraction_failures'] += 1
                        stats['failed_extractions'].append(lst_id)
                        if qmd_file not in stats['files_with_issues']:
                            stats['files_with_issues'].append(qmd_file)

            # Summary for this file
            if file_figures > 0 or file_tables > 0 or file_listings > 0:
                print(f"    📊 File summary: {file_figures} figures, {file_tables} tables, {file_listings} listings")

        except Exception as e:
            # A file-level read failure counts as one extraction failure.
            print(f"  ❌ Error reading {qmd_file}: {e}")
            stats['extraction_failures'] += 1
            if qmd_file not in stats['files_with_issues']:
                stats['files_with_issues'].append(qmd_file)

    # Final summary - only show relevant types based on filter mode
    print(f"\n📊 QMD EXTRACTION SUMMARY:")

    # Show figures info only if processing figures
    if not tables_only and not listings_only:
        print(f"   📊 Figures: {stats['figures_found']} found")
        print(f"      • Markdown: {stats['markdown_figures']}")
        print(f"      • TikZ: {stats['tikz_figures']}")
        print(f"      • R: {stats['r_figures']}")
        print(f"      • Code: {stats['code_figures']}")

    # Show tables info only if processing tables
    if not figures_only and not listings_only:
        print(f"   📋 Tables: {stats['tables_found']} found")

    # Show listings info only if processing listings
    if not figures_only and not tables_only:
        print(f"   📝 Listings: {stats['listings_found']} found")

    print(f"   ⚠️ Extraction failures: {stats['extraction_failures']}")

    # Show specific failed extractions
    if stats['extraction_failures'] > 0 and 'failed_extractions' in stats:
        print(f"   📋 Failed extractions:")
        for failed_id in stats['failed_extractions']:
            print(f"      • {failed_id}")

    if stats['files_with_issues']:
        print(f"   📁 Files with issues: {len(stats['files_with_issues'])}")
        for file in stats['files_with_issues'][:5]:  # Show first 5
            print(f"      • {file}")
        if len(stats['files_with_issues']) > 5:
            print(f"      • ... and {len(stats['files_with_issues']) - 5} more")

    # Calculate success rate based on what we're processing
    processed_count = 0
    if not tables_only and not listings_only:
        processed_count += stats['figures_found']
    if not figures_only and not listings_only:
        processed_count += stats['tables_found']
    if not figures_only and not tables_only:
        processed_count += stats['listings_found']

    total_ids = processed_count + stats['extraction_failures']
    success_rate = processed_count / total_ids * 100 if total_ids > 0 else 0
    print(f"   ✅ Success rate: {success_rate:.1f}%")

    return content_map
|
||
|
||
def process_qmd_files(self, directories: List[str], content_map: Optional[Dict] = None):
    """
    Process QMD files to update captions using targeted search-and-replace.

    Uses individual search-and-replace operations for each caption change
    to preserve file integrity, encoding, and formatting.

    Args:
        directories: List of directories to process
        content_map: Content map with figures and tables data (optional, will build if None)
    """
    print("📝 Processing QMD files for caption updates...")

    # Build content map if not provided
    if content_map is None:
        print("📄 Building content map from QMD files...")
        content_map = self.build_content_map_from_qmd(directories)
        if not content_map:
            print("❌ Failed to build content map")
            return

    # Collect all items that need updates.  Figures, tables, and listings
    # share the same record shape in the content map, so one data-driven
    # loop replaces three previously duplicated ones.
    updates_to_apply = []
    for map_key, item_type in (('figures', 'figure'),
                               ('tables', 'table'),
                               ('listings', 'listing')):
        for item_id, item_data in content_map.get(map_key, {}).items():
            # Only items the LLM pass annotated with a non-empty new caption.
            if not item_data.get('new_caption'):
                continue
            source_file = item_data.get('source_file')
            original_caption = item_data.get('original_caption', '')
            new_caption = item_data.get('new_caption', '')

            # All three fields are required to build a precise replacement.
            if source_file and original_caption and new_caption:
                updates_to_apply.append({
                    'file': source_file,
                    'id': item_id,
                    'type': item_type,
                    'original_caption': original_caption,
                    'new_caption': new_caption
                })

    if not updates_to_apply:
        print("ℹ️ No caption updates needed (no new_caption entries found)")
        return

    # Apply targeted search-and-replace for each update
    total_successful = 0
    total_failed = 0

    for update in updates_to_apply:
        try:
            file_path = update['file']
            item_id = update['id']
            item_type = update['type']
            original_caption = update['original_caption']
            new_caption = update['new_caption']

            print(f"📄 Updating {item_id} in {file_path}")

            # Create targeted search pattern for this specific caption
            success = self.apply_targeted_caption_update(
                file_path, item_id, item_type, original_caption, new_caption
            )

            if success:
                total_successful += 1
                print(f" ✅ Updated: {item_id}")
            else:
                total_failed += 1
                print(f" ❌ Failed: {item_id} (pattern not found)")

        except Exception as e:
            total_failed += 1
            print(f" ❌ Error updating {update['id']}: {e}")

    print(f"📊 Summary: {total_successful} successful updates, {total_failed} failed")
def apply_targeted_caption_update(self, file_path: str, item_id: str, item_type: str,
                                  original_caption: str, new_caption: str) -> bool:
    """
    Apply a single targeted caption update using precise search-and-replace.
    Enhanced to handle multiple instances for listings using atomic updates.

    Listings go through the instance-detection/canonicalization path (a
    listing id may appear more than once); figures and tables use a single
    exact-match pattern that must occur exactly once in the file.

    Args:
        file_path: Path to the QMD file
        item_id: Figure or table ID (e.g., "fig-example", "tbl-data", "lst-code")
        item_type: "figure", "table", or "listing"
        original_caption: Current caption text to find
        new_caption: New caption text to replace with

    Returns:
        True if update was successful, False otherwise
    """
    try:
        # Read current file content
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Use enhanced atomic update system for listings
        if item_type == 'listing':
            # Detect all instances of this listing
            instances = self.detect_all_instances_by_id(content, item_id, "listing")

            if not instances:
                print(f" ⚠️ No instances found for {item_id}")
                return False

            # Verify captions match what we expect
            canonical_caption, warnings = self.canonicalize_caption(instances)
            if canonical_caption != original_caption:
                print(f" ⚠️ Caption mismatch for {item_id}: expected '{original_caption}', found '{canonical_caption}'")
                # Try to proceed anyway if close (one caption contains the other);
                # bail out only when the two are entirely unrelated.
                if original_caption.strip() not in canonical_caption and canonical_caption.strip() not in original_caption:
                    return False

            # Apply atomic update to all instances: either every instance is
            # rewritten or none is, so the file never ends up half-updated.
            updated_content, success, errors = self.update_all_instances_atomically(
                content, item_id, instances, new_caption
            )

            if not success:
                for error in errors:
                    print(f" ❌ {error}")
                return False

            # Write updated content
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(updated_content)

            # Success message with instance count
            print(f" ✅ Updated {len(instances)} instances atomically")
            return True

        # Fall back to original logic for figures and tables
        if item_type == 'figure':
            old_pattern, new_pattern = self.build_figure_search_patterns(
                item_id, original_caption, new_caption, content
            )
        elif item_type == 'table':
            old_pattern, new_pattern = self.build_table_search_patterns(
                item_id, original_caption, new_caption, content
            )
        else:
            # Unknown item type: nothing to do.
            return False

        if not old_pattern:
            # No matching definition found in the file.
            return False

        # Verify the pattern exists exactly once — replacing an ambiguous
        # pattern could corrupt an unrelated occurrence.
        if content.count(old_pattern) != 1:
            print(f" ⚠️ Pattern occurs {content.count(old_pattern)} times (expected 1)")
            return False

        # Apply the replacement
        new_content = content.replace(old_pattern, new_pattern)

        # Write back the file
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(new_content)

        return True

    except Exception as e:
        # Broad catch by design: a failure on one item must not abort the batch.
        print(f" ❌ Error in targeted update: {e}")
        return False
def build_figure_search_patterns(self, fig_id: str, original_caption: str,
                                 new_caption: str, content: str) -> Tuple[str, str]:
    """
    Build precise search patterns for figure caption replacement.

    Tries the supported figure syntaxes in order of specificity:
      1. Markdown image figures:  ![caption](path){#fig-id}
      2. TikZ/div figures:        caption line inside a ::: {#fig-id} block
      3. Code-cell figures:       "#| fig-cap:" lines (R cells get YAML-safe
                                  handling; other cells use a plain fallback)

    Args:
        fig_id: Figure identifier (e.g. "fig-example").
        original_caption: Current caption text to locate.
        new_caption: Replacement caption text.
        content: Full QMD file content to search in.

    Returns:
        Tuple of (old_pattern, new_pattern) literal strings, or (None, None)
        if no matching figure definition was found.
    """
    # Smart escaping preserves parentheses and other normal caption
    # characters; computed once and reused by every branch below
    # (previously recomputed per branch, and each regex was re-searched
    # up to three times — the match object is now bound once).
    escaped_caption = self.escape_caption_for_regex(original_caption)

    # 1. Markdown figure: ![caption](path){#fig-id}
    markdown_pattern = rf'!\[{escaped_caption}\](\([^)]+\)\s*\{{[^}}]*#{re.escape(fig_id)}[^}}]*\}})'
    md_match = re.search(markdown_pattern, content)
    if md_match:
        return md_match.group(0), f'![{new_caption}]' + md_match.group(1)

    # 2. TikZ figure: caption line between the closing ``` and the closing :::
    tikz_div_pattern = rf'(:::\s*\{{[^}}]*#{re.escape(fig_id)}[^}}]*\}}.*?```\s*\n\s*){escaped_caption}(\s*)(:::)'
    match = re.search(tikz_div_pattern, content, re.DOTALL)
    if match:
        # Ensure there's a line break before the closing :::
        line_break = match.group(2) if match.group(2) else '\n'
        return match.group(0), match.group(1) + new_caption + line_break + match.group(3)

    # 3. Code figure: #| fig-cap: "caption" (with R figure YAML-safe handling).
    # Check if this is an R figure by looking at the code block context.
    r_code_pattern = rf'```\{{r[^}}]*\}}[^`]*?#\|\s*label:\s*{re.escape(fig_id)}[^`]*?#\|\s*fig-cap:\s*([^\n]+)'
    r_match = re.search(r_code_pattern, content, re.DOTALL | re.MULTILINE)
    if r_match:
        # This is an R figure - handle YAML-safe extraction and updating
        yaml_value = r_match.group(1).strip()
        clean_caption = self.extract_caption_from_yaml_value(yaml_value)
        if clean_caption == original_caption:
            old_pattern = r_match.group(0)
            # Ensure new caption is YAML-safe for R figures
            yaml_safe_new_caption = self.ensure_yaml_safe_caption(new_caption)
            # Replace just the caption part, keeping everything up to the value
            prefix = old_pattern[:old_pattern.rfind(yaml_value)]
            return old_pattern, prefix + yaml_safe_new_caption

    # Fallback for non-R code figures
    code_pattern = rf'(#\|\s*fig-cap:\s*["\']?){escaped_caption}(["\']?)'
    code_match = re.search(code_pattern, content)
    if code_match:
        return code_match.group(0), code_match.group(1) + new_caption + code_match.group(2)

    return None, None
def build_table_search_patterns(self, tbl_id: str, original_caption: str,
                                new_caption: str, content: str) -> Tuple[str, str]:
    """
    Build simple search patterns for table caption replacement.

    Always produces the simple format: `: [caption]. {#tbl-id [attributes]}`

    Returns:
        Tuple of (old_pattern, new_pattern) or (None, None) if not found
    """
    # Match a whole line that carries both the caption and the {#tbl-id}
    # attribute block, in either `: caption {#tbl-id}` or `caption {#tbl-id}`
    # form (the leading colon is optional in the pattern).
    caption_re = self.escape_caption_for_regex(original_caption)
    id_re = re.escape(tbl_id)
    line_pattern = rf'^:?\s*{caption_re}(\s*\{{[^}}]*#{id_re}[^}}]*\}})(.*)$'

    found = re.search(line_pattern, content, re.MULTILINE)
    if found is None:
        return None, None

    attributes = found.group(1)  # the {#tbl-id ...} part
    trailing = found.group(2)    # anything after the closing brace

    # Normalize the caption to end with exactly one period.
    replacement_caption = new_caption if new_caption.endswith('.') else new_caption + '.'

    # Always emit the canonical simple format: `: caption {#tbl-id ...}`.
    return found.group(0), f': {replacement_caption} {attributes.strip()}{trailing}'
def build_listing_search_patterns(self, lst_id: str, original_caption: str,
|
||
new_caption: str, content: str) -> Tuple[str, str]:
|
||
"""
|
||
Build precise search patterns for listing caption replacement.
|
||
|
||
Handles two patterns:
|
||
1. Traditional: #| lst-cap: "caption" or #| lst-cap: caption
|
||
2. HTML callout: title="caption"
|
||
|
||
Returns:
|
||
Tuple of (old_pattern, new_pattern) or (None, None) if not found
|
||
"""
|
||
|
||
# Method 1: Traditional lst-cap pattern
|
||
# Try quoted version first
|
||
quoted_pattern = rf'(#\|\s*lst-cap:\s*"){re.escape(original_caption)}(")'
|
||
match = re.search(quoted_pattern, content)
|
||
if match:
|
||
old_pattern = match.group(0)
|
||
new_pattern = match.group(1) + new_caption + match.group(2)
|
||
return old_pattern, new_pattern
|
||
|
||
# Try unquoted version
|
||
unquoted_pattern = rf'(#\|\s*lst-cap:\s*){re.escape(original_caption)}(\s*$)'
|
||
match = re.search(unquoted_pattern, content, re.MULTILINE)
|
||
if match:
|
||
old_pattern = match.group(0)
|
||
new_pattern = match.group(1) + new_caption + match.group(2)
|
||
return old_pattern, new_pattern
|
||
|
||
# Method 2: HTML callout title pattern
|
||
# Pattern: title="original_caption" within a div containing lst-id
|
||
title_pattern = rf'(:::\s*\{{[^}}]*#{re.escape(lst_id)}[^}}]*title\s*=\s*"){re.escape(original_caption)}("[^}}]*\}})'
|
||
match = re.search(title_pattern, content)
|
||
if match:
|
||
old_pattern = match.group(0)
|
||
new_pattern = match.group(1) + new_caption + match.group(2)
|
||
return old_pattern, new_pattern
|
||
|
||
return None, None
|
||
|
||
def improve_captions_with_llm(self, directories: List[str], content_map: Optional[Dict] = None,
                              figures_only: bool = False, tables_only: bool = False, listings_only: bool = False):
    """Improve captions using LLM and immediately update each file after processing.

    Items are grouped by source file so each QMD file is read once and
    rewritten at most once, immediately after its items are processed —
    a crash mid-run therefore loses at most one file's worth of work.

    Args:
        directories: Directories to scan when no content map is supplied.
        content_map: Pre-built content map; built on demand when None.
        figures_only: Restrict processing to figures.
        tables_only: Restrict processing to tables.
        listings_only: Restrict processing to listings.

    Returns:
        The content map with 'new_caption' set on every improved item,
        or {} when the map could not be built.
    """
    print("🤖 Improving captions with LLM...")

    # Build content map if not provided
    if content_map is None:
        print("📄 Building content map from QMD files...")
        content_map = self.build_content_map_from_qmd(directories, figures_only=figures_only, tables_only=tables_only, listings_only=listings_only)
        if not content_map:
            print("❌ Failed to build content map")
            return {}

    total_figures = len(content_map.get('figures', {}))
    total_tables = len(content_map.get('tables', {}))
    total_listings = len(content_map.get('listings', {}))

    if total_figures == 0 and total_tables == 0 and total_listings == 0:
        print("❌ No figures, tables, or listings found in content map")
        return content_map

    # Show processing message with only relevant types
    processing_parts = []
    if not tables_only and not listings_only and total_figures > 0:
        processing_parts.append(f"{total_figures} figures")
    if not figures_only and not listings_only and total_tables > 0:
        processing_parts.append(f"{total_tables} tables")
    if not figures_only and not tables_only and total_listings > 0:
        processing_parts.append(f"{total_listings} listings")

    if processing_parts:
        print(f"📊 Processing: {', '.join(processing_parts)}")
    else:
        print("📊 No items to process")

    # Group items by source file for efficient processing.
    # Maps source_file -> {'figures': [(id, data)], 'tables': [...], 'listings': [...]}.
    files_to_process = {}

    # Group figures by file
    for fig_id, fig_data in content_map.get('figures', {}).items():
        source_file = fig_data.get('source_file')
        if source_file:
            if source_file not in files_to_process:
                files_to_process[source_file] = {'figures': [], 'tables': [], 'listings': []}
            files_to_process[source_file]['figures'].append((fig_id, fig_data))

    # Group tables by file
    for tbl_id, tbl_data in content_map.get('tables', {}).items():
        source_file = tbl_data.get('source_file')
        if source_file:
            if source_file not in files_to_process:
                files_to_process[source_file] = {'figures': [], 'tables': [], 'listings': []}
            files_to_process[source_file]['tables'].append((tbl_id, tbl_data))

    # Group listings by file
    for lst_id, lst_data in content_map.get('listings', {}).items():
        source_file = lst_data.get('source_file')
        if source_file:
            if source_file not in files_to_process:
                files_to_process[source_file] = {'figures': [], 'tables': [], 'listings': []}
            files_to_process[source_file]['listings'].append((lst_id, lst_data))

    total_improved = 0
    files_updated = 0

    # Process each file independently
    for source_file, items in files_to_process.items():
        print(f"\n📄 Processing file: {source_file}")

        try:
            # Read file content once for this file
            with open(source_file, 'r', encoding='utf-8') as f:
                file_content = f.read()

            file_improvements = []
            file_improved_count = 0

            # Process all figures in this file
            for fig_id, fig_data in items['figures']:
                print(f" 📊 Processing figure: {fig_id}")

                try:
                    # Extract context around this figure
                    context = self.extract_section_context(file_content, fig_id)

                    # Find image path if it's a markdown figure, so the LLM
                    # can be shown the actual image alongside the context.
                    image_path = None
                    if fig_data.get('type') == 'markdown':
                        # Try to extract image path from the figure definition
                        image_pattern = rf'!\[[^\]]*\]\(([^)]+)\)[^{{]*{{[^}}]*#{re.escape(fig_id)}'
                        match = re.search(image_pattern, file_content)
                        if match:
                            relative_path = match.group(1)
                            # Resolve relative to the source file directory
                            source_dir = Path(source_file).parent
                            image_path = str(source_dir / relative_path)
                            if not os.path.exists(image_path):
                                # Missing file: fall back to text-only prompting.
                                image_path = None

                    # Generate improved caption
                    current_caption = fig_data.get('original_caption', '')
                    new_caption = self.generate_caption_with_ollama(
                        context['title'],
                        context['content'],
                        fig_id,
                        current_caption,
                        image_path,
                        is_table=False
                    )

                    if new_caption and new_caption != current_caption:
                        fig_data['new_caption'] = new_caption
                        file_improvements.append({
                            'id': fig_id,
                            'type': 'figure',
                            'original': current_caption,
                            'new': new_caption
                        })
                        file_improved_count += 1
                        word_count = len(new_caption.split())
                        print(f" ✅ Improved ({word_count} words): {new_caption[:120]}{'...' if len(new_caption) > 120 else ''}")
                    else:
                        print(f" ⚠️ No improvement generated")

                except Exception as e:
                    # One bad item must not abort the rest of the file.
                    print(f" ❌ Error processing {fig_id}: {e}")

            # Process all tables in this file
            for tbl_id, tbl_data in items['tables']:
                print(f" 📋 Processing table: {tbl_id}")

                try:
                    # Extract context around this table
                    context = self.extract_section_context(file_content, tbl_id)

                    # Generate improved caption (no image for tables)
                    current_caption = tbl_data.get('original_caption', '')
                    new_caption = self.generate_caption_with_ollama(
                        context['title'],
                        context['content'],
                        tbl_id,
                        current_caption,
                        None,  # No image for tables
                        is_table=True
                    )

                    if new_caption and new_caption != current_caption:
                        tbl_data['new_caption'] = new_caption
                        file_improvements.append({
                            'id': tbl_id,
                            'type': 'table',
                            'original': current_caption,
                            'new': new_caption
                        })
                        file_improved_count += 1
                        word_count = len(new_caption.split())
                        print(f" ✅ Improved ({word_count} words): {new_caption[:120]}{'...' if len(new_caption) > 120 else ''}")
                    else:
                        print(f" ⚠️ No improvement generated")

                except Exception as e:
                    print(f" ❌ Error processing {tbl_id}: {e}")

            # Process all listings in this file
            for lst_id, lst_data in items['listings']:
                print(f" 📝 Processing listing: {lst_id}")

                try:
                    # Extract context around this listing
                    context = self.extract_section_context(file_content, lst_id)

                    # Get the actual code content from the listing so the LLM
                    # can describe what the code really does.
                    code_content = ""
                    lst_def = self.find_listing_definition_in_qmd(file_content, lst_id)
                    if lst_def:
                        # Extract code from the full_text
                        code_pattern = r'```[^`]*```'
                        code_match = re.search(code_pattern, lst_def['full_text'], re.DOTALL)
                        if code_match:
                            code_content = code_match.group(0)

                    # Generate improved caption with code content
                    current_caption = lst_data.get('original_caption', '')
                    new_caption = self.generate_caption_with_ollama(
                        context['title'],
                        context['content'],
                        lst_id,
                        current_caption,
                        None,  # No image for listings
                        is_table=False,
                        is_listing=True,
                        code_content=code_content
                    )

                    if new_caption and new_caption != current_caption:
                        lst_data['new_caption'] = new_caption
                        file_improvements.append({
                            'id': lst_id,
                            'type': 'listing',
                            'original': current_caption,
                            'new': new_caption
                        })
                        file_improved_count += 1
                        word_count = len(new_caption.split())
                        print(f" ✅ Improved ({word_count} words): {new_caption[:120]}{'...' if len(new_caption) > 120 else ''}")
                    else:
                        print(f" ⚠️ No improvement generated")

                except Exception as e:
                    print(f" ❌ Error processing {lst_id}: {e}")

            # Immediately update this file if we have improvements
            if file_improvements:
                print(f" ✏️ Updating file with {file_improved_count} improvements...")
                self.update_single_file_captions(source_file, file_improvements)
                files_updated += 1
                print(f" ✅ File updated successfully!")
            else:
                print(f" ℹ️ No improvements for this file - skipping update")

            total_improved += file_improved_count

        except Exception as e:
            print(f" ❌ Error processing file {source_file}: {e}")

    print(f"\n🎉 LLM improvement complete!")
    print(f" 📊 Total captions improved: {total_improved}")
    print(f" 📁 Files updated: {files_updated}")
    return content_map
def update_single_file_captions(self, file_path: str, improvements: List[Dict]):
    """
    Update a single QMD file with improved captions using targeted search-and-replace.

    Args:
        file_path: Path to the QMD file to update
        improvements: List of dicts with 'id', 'type', 'original', 'new' keys
    """
    if not improvements:
        return

    success_count = 0

    for entry in improvements:
        item_id = entry['id']
        try:
            # Delegate the actual replacement to the targeted updater.
            applied = self.apply_targeted_caption_update(
                file_path, item_id, entry['type'], entry['original'], entry['new']
            )
            if applied:
                success_count += 1
                print(f" ✅ Updated {item_id}")
            else:
                print(f" ⚠️ Failed to update {item_id}")
        except Exception as e:
            # Keep going: one failed item must not block the rest.
            print(f" ❌ Error updating {item_id}: {e}")

    print(f" 📊 Successfully updated {success_count}/{len(improvements)} items in {file_path}")
def complete_caption_improvement_workflow(self, directories: List[str], save_json: bool = False, figures_only: bool = False, tables_only: bool = False, listings_only: bool = False):
    """
    Complete LLM caption improvement process (used by --improve and default mode).

    Process: Extract → Analyze → Improve with LLM → Update files → Validate

    Args:
        directories: List of directories to process
        save_json: Whether to save detailed JSON output
        figures_only: If True, only process figures (ignore tables and listings)
        tables_only: If True, only process tables (ignore figures and listings)
        listings_only: If True, only process listings (ignore figures and tables)

    Returns:
        The improved content map, or {} when the run was halted or failed.
    """
    print("🚀 Starting complete caption improvement workflow...")

    # Check for commented chapters first
    commented_issues = self.check_commented_chapters_in_directories(directories)
    should_halt = self.print_commented_chapter_issues(commented_issues)
    if should_halt:
        return {}

    # Step 1: Build content map with filtering
    print("📄 Step 1: Building content map...")
    content_map = self.build_content_map_from_qmd(directories, figures_only=figures_only, tables_only=tables_only, listings_only=listings_only)
    if not content_map:
        print("❌ Failed to build content map")
        return {}

    # BUG FIX: listings are processed and updated by this workflow too,
    # but were previously left out of the totals, so the reported counts
    # (and the success rate derived from them) were wrong in listing runs.
    total_items = (len(content_map.get('figures', {}))
                   + len(content_map.get('tables', {}))
                   + len(content_map.get('listings', {})))
    print(f"✅ Found {total_items} items to process")

    # Optional: Save JSON for inspection
    if save_json:
        self.save_content_map(content_map)
        print("💾 Content map saved to content_map.json for inspection")

    # Step 2: Improve captions using LLM (files updated immediately during processing)
    print("\n🤖 Step 2: Improving captions with LLM...")
    improved_content_map = self.improve_captions_with_llm(directories, content_map, figures_only=figures_only, tables_only=tables_only, listings_only=listings_only)

    # Count improvements across all item kinds (figures, tables, listings).
    improved_count = 0
    for kind in ('figures', 'tables', 'listings'):
        for item_data in improved_content_map.get(kind, {}).values():
            if item_data.get('new_caption'):
                improved_count += 1

    if improved_count == 0:
        print("⚠️ No captions were improved. Workflow complete.")
        return improved_content_map

    print(f"✅ {improved_count} captions improved and files updated")

    # Step 3: Save improvements summary to JSON file
    print("\n💾 Step 3: Saving improvements summary...")
    improvements_file = self.save_improvements_summary(improved_content_map, directories, improved_count, total_items)

    print("\n🎉 LLM caption improvement completed successfully!")
    print(f"📊 Total items processed: {total_items}")
    print(f"📝 Items improved: {improved_count}")
    print(f"📁 Directories: {', '.join(directories)}")
    print(f"📄 Improvements saved to: {improvements_file}")

    return improved_content_map
def save_improvements_summary(self, content_map: Dict, directories: List[str], improved_count: int, total_items: int) -> str:
    """
    Save a comprehensive summary of caption improvements to a JSON file.

    Args:
        content_map: Content map with original and improved captions
        directories: Directories processed
        improved_count: Number of items improved
        total_items: Total items processed

    Returns:
        Path to the saved improvements file, or "" on failure.
    """
    # NOTE: the previous version re-imported datetime locally; it is
    # already imported at module level, so the local import was dropped.

    # Create improvements summary
    improvements = {
        'metadata': {
            'timestamp': datetime.now().isoformat(),
            'directories_processed': directories,
            'total_items': total_items,
            'items_improved': improved_count,
            'success_rate': f"{(improved_count/total_items*100):.1f}%" if total_items > 0 else "0%",
            'workflow': 'complete_caption_improvement_workflow',
            'method': 'pypandoc_ast_context_extraction'
        },
        'improvements': {
            'figures': {},
            'tables': {},
            # CONSISTENCY FIX: listings get improved captions in the same
            # pipeline but were previously missing from the summary.
            'listings': {}
        },
        'summary': {
            'figures_improved': 0,
            'tables_improved': 0,
            'listings_improved': 0,
            'no_change': 0
        }
    }

    # All three item kinds share the same record shape in the content map,
    # so a single data-driven loop replaces the duplicated per-kind loops.
    kinds = (
        ('figures', 'figures_improved', lambda d: d.get('type', 'unknown')),
        ('tables', 'tables_improved', lambda d: 'table'),
        ('listings', 'listings_improved', lambda d: 'listing'),
    )
    for map_key, counter_key, type_of in kinds:
        for item_id, item_data in content_map.get(map_key, {}).items():
            original = item_data.get('original_caption', '')
            improved = item_data.get('new_caption', '')
            changed = bool(improved) and improved != original

            improvements['improvements'][map_key][item_id] = {
                'id': item_id,
                'type': type_of(item_data),
                'source_file': item_data.get('source_file', ''),
                'original_caption': original,
                'improved_caption': improved,
                'status': 'improved' if changed else 'no_change'
            }

            if changed:
                improvements['summary'][counter_key] += 1
            else:
                improvements['summary']['no_change'] += 1

    # Generate filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"caption_improvements_{timestamp}.json"

    # Save to JSON file
    try:
        # Convert any Path objects to strings for JSON serialization
        serializable_improvements = convert_paths_to_strings(improvements)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(serializable_improvements, f, indent=2, ensure_ascii=False)

        # BUG FIX: this message previously printed the literal "(unknown)"
        # instead of interpolating the actual output filename.
        print(f"📄 Improvements summary saved to: {filename}")
        print(f"📊 Summary: {improvements['summary']['figures_improved']} figures + {improvements['summary']['tables_improved']} tables improved")

        return filename

    except Exception as e:
        print(f"❌ Error saving improvements summary: {e}")
        return ""
def extract_references_with_pypandoc(self, content: str, qmd_file_path: str) -> Dict[str, Dict]:
    """
    Use pypandoc to systematically extract figure/table references and their context.

    This approach:
    1. Parses QMD to structured AST using pypandoc
    2. Finds all @fig-id and @tbl-id references in paragraphs
    3. Extracts surrounding paragraph context systematically
    4. Maps references to their #fig-id definitions

    On any pypandoc/parse failure this returns {} so the caller can fall
    back to the regex-based extraction path.

    Returns:
        Dict mapping figure_id -> {
            'reference_context': paragraph context around @fig-id,
            'definition_info': info about #fig-id definition,
            'section_title': section containing the reference
        }
    """
    try:
        # Convert QMD to JSON AST using pypandoc
        ast_json = pypandoc.convert_text(
            content,
            'json',
            format='markdown+smart',
            extra_args=['--preserve-tabs']
        )
        ast = json.loads(ast_json)

        # ref_id -> metadata; first reference to an id wins (later
        # occurrences are ignored below).
        references_map = {}

        # Walk the AST to find references and context
        def walk_ast(element, section_title="Unknown Section", paragraph_context=None):
            if isinstance(element, dict):
                element_type = element.get('t', '')

                # Track section headers.
                # NOTE(review): the rebinding of section_title here is local
                # to this call, so sibling blocks that follow a Header in the
                # same list never see the new title — most references will
                # report "Unknown Section". Verify whether that is intended.
                if element_type == 'Header':
                    # Header 'c' is [level, attr, inlines].
                    level = element.get('c', [None, None, []])[0]
                    if level == 2:  # ## headers
                        inlines = element.get('c', [None, None, []])[2]
                        section_title = self._extract_text_from_inlines(inlines)

                # Process paragraphs to find cross-references
                elif element_type == 'Para':
                    para_content = element.get('c', [])
                    para_text = self._extract_text_from_inlines(para_content)

                    # Find figure/table references in this paragraph
                    fig_refs = re.findall(r'@(fig-[a-zA-Z0-9_-]+)', para_text)
                    tbl_refs = re.findall(r'@(tbl-[a-zA-Z0-9_-]+)', para_text)

                    for ref_id in fig_refs + tbl_refs:
                        if ref_id not in references_map:
                            # Get definition info (figure lookup first,
                            # table lookup as fallback)
                            def_info = self.find_figure_definition_in_qmd(content, ref_id) or \
                                       self.find_table_definition_in_qmd(content, ref_id)

                            references_map[ref_id] = {
                                'reference_paragraph': para_text,
                                'section_title': section_title,
                                'definition_info': def_info,
                                'file_path': qmd_file_path
                            }

                # Recursively process content
                if 'c' in element:
                    if isinstance(element['c'], list):
                        for item in element['c']:
                            walk_ast(item, section_title, paragraph_context)
                    else:
                        walk_ast(element['c'], section_title, paragraph_context)

            elif isinstance(element, list):
                for item in element:
                    walk_ast(item, section_title, paragraph_context)

        # Start walking from the document blocks
        blocks = ast.get('blocks', [])
        walk_ast(blocks)

        # Now get adjacent paragraph context for each reference
        for ref_id, ref_data in references_map.items():
            context = self._get_adjacent_paragraphs_from_ast(ast, ref_data['reference_paragraph'])
            ref_data['context_paragraphs'] = context

        return references_map

    except Exception as e:
        print(f"⚠️ pypandoc parsing failed: {e}")
        print(f" Falling back to regex-based approach")
        return {}
def _extract_text_from_inlines(self, inlines: List) -> str:
|
||
"""Extract plain text from pypandoc inline elements."""
|
||
text_parts = []
|
||
|
||
def extract_from_element(element):
|
||
if isinstance(element, dict):
|
||
element_type = element.get('t', '')
|
||
if element_type == 'Str':
|
||
return element.get('c', '')
|
||
elif element_type == 'Space':
|
||
return ' '
|
||
elif element_type in ['Emph', 'Strong', 'Code']:
|
||
# Extract text from emphasized/strong/code content
|
||
content = element.get('c', [])
|
||
if isinstance(content, list):
|
||
return ''.join(extract_from_element(item) for item in content)
|
||
return str(content)
|
||
elif element_type == 'Link':
|
||
# Extract text from link content (first element of c)
|
||
link_content = element.get('c', [[], '', []])[0]
|
||
return ''.join(extract_from_element(item) for item in link_content)
|
||
# Handle other inline types as needed
|
||
elif 'c' in element:
|
||
content = element['c']
|
||
if isinstance(content, list):
|
||
return ''.join(extract_from_element(item) for item in content)
|
||
return str(content)
|
||
elif isinstance(element, str):
|
||
return element
|
||
return ''
|
||
|
||
for inline in inlines:
|
||
text_parts.append(extract_from_element(inline))
|
||
|
||
return ''.join(text_parts)
|
||
|
||
def _get_adjacent_paragraphs_from_ast(self, ast: Dict, target_paragraph: str) -> List[str]:
    """
    Locate target_paragraph among the document's paragraphs and return its
    window: [previous, current, next], omitting missing neighbours at the edges.
    """
    all_paras: List[str] = []

    def harvest(nodes):
        # Depth-first collection of every Para's plain text.
        for node in nodes:
            if not isinstance(node, dict):
                continue
            if node.get('t') == 'Para':
                all_paras.append(self._extract_text_from_inlines(node.get('c', [])))
            elif isinstance(node.get('c'), list):
                harvest(node['c'])

    harvest(ast.get('blocks', []))

    # Match by substring in either direction to tolerate minor text drift.
    found = None
    for idx, text in enumerate(all_paras):
        if target_paragraph in text or text in target_paragraph:
            found = idx
            break

    if found is None:
        return [target_paragraph]  # Fallback: paragraph not found in AST

    # Slice yields prev (if any), current, next (if any) in order.
    return all_paras[max(found - 1, 0):found + 2]
def check_ollama_and_model(self, model_name: str) -> bool:
    """
    Check that the local Ollama server is running and that *model_name* is
    already pulled.

    If the model is missing, this does NOT auto-pull it: it prints the exact
    `ollama pull` command plus a short list of locally available models.

    Args:
        model_name: Ollama model tag to look for (e.g. "qwen2.5:7b").

    Returns:
        True if the model is ready to use, False on any problem (server down,
        model missing, network error).
    """
    print(f"🔍 Checking Ollama and model: {model_name}")

    try:
        # Check if Ollama is running (default local API endpoint)
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
        if response.status_code != 200:
            print("❌ Ollama server not responding. Please start Ollama:")
            print(" brew services start ollama")
            print(" # or")
            print(" ollama serve")
            return False

        # Get list of available models
        models_data = response.json()
        available_models = [model['name'] for model in models_data.get('models', [])]

        print(f"📦 Available models: {len(available_models)} found")

        # Check if our model is available
        if model_name in available_models:
            print(f"✅ Model {model_name} is ready!")
            return True

        # Model not found - provide instructions instead of auto-pulling
        print(f"❌ Model {model_name} not found locally.")
        print("💡 To use this model, please run:")
        print(f" ollama pull {model_name}")
        print()
        print("📋 Available models:")
        for model in available_models[:5]:  # Show first 5
            print(f" • {model}")
        if len(available_models) > 5:
            print(f" ... and {len(available_models) - 5} more")
        print()
        print("🔧 To see all models: ollama list")
        return False

    except requests.exceptions.ConnectionError:
        # Server not reachable at all — give full install/start instructions.
        print("❌ Cannot connect to Ollama. Please ensure Ollama is installed and running:")
        print(" 1. Install: curl -fsSL https://ollama.ai/install.sh | sh")
        print(" 2. Start: ollama serve")
        print(f" 3. Pull model: ollama pull {model_name}")
        return False
    except requests.exceptions.Timeout:
        print("⏰ Timeout while pulling model. Large models can take 10+ minutes.")
        print("💡 Try running manually in another terminal:")
        print(f" ollama pull {model_name}")
        return False
    except Exception as e:
        # Catch-all so a status check can never crash the caller.
        print(f"❌ Unexpected error checking Ollama: {e}")
        return False
def list_available_models(self) -> bool:
    """
    List all available Ollama models.

    Queries the local Ollama HTTP API and prints a table of model name,
    size, and last-modified date, plus suggestions when no models exist.

    Returns:
        True if the listing succeeded (including the empty case),
        False if the server is unreachable or errors out.
    """
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
        if response.status_code != 200:
            print("❌ Ollama server not responding. Please start Ollama.")
            return False

        models_data = response.json()
        models = models_data.get('models', [])

        if not models:
            # Nothing installed yet: suggest a few common starter models.
            print("📦 No models found. Popular models to try:")
            print(" ollama pull qwen2.5:7b # Fast, good quality")
            print(" ollama pull llama3.2:3b # Smaller, faster")
            print(" ollama pull qwen2.5:14b # Larger, better quality")
            print(" ollama pull mistral:7b # Alternative option")
            return True

        print(f"📦 Available Ollama Models ({len(models)} found):")
        print("=" * 60)

        # Sort models by size for better display
        sorted_models = sorted(models, key=lambda x: x.get('size', 0))

        for model in sorted_models:
            name = model.get('name', 'Unknown')
            size = model.get('size', 0)
            size_gb = size / (1024**3) if size > 0 else 0
            modified = model.get('modified_at', '')

            # Format size nicely (GB for large models, MB otherwise)
            if size_gb >= 1:
                size_str = f"{size_gb:.1f}GB"
            else:
                size_mb = size / (1024**2)
                size_str = f"{size_mb:.0f}MB"

            # Format the modification date; tolerate malformed timestamps.
            if modified:
                try:
                    from datetime import datetime
                    dt = datetime.fromisoformat(modified.replace('Z', '+00:00'))
                    date_str = dt.strftime('%Y-%m-%d')
                except (ValueError, AttributeError):
                    # FIX: was a bare `except:` that also swallowed
                    # KeyboardInterrupt/SystemExit. ValueError covers bad ISO
                    # strings; AttributeError covers non-string values.
                    date_str = modified[:10]
            else:
                date_str = "Unknown"

            print(f" 📊 {name:<25} │ {size_str:>8} │ {date_str}")

        print("=" * 60)
        print("💡 Usage: python improve_figure_captions.py -d contents/vol1/ --model MODEL_NAME")
        return True

    except requests.exceptions.ConnectionError:
        print("❌ Cannot connect to Ollama. Please ensure Ollama is running.")
        return False
    except Exception as e:
        print(f"❌ Error listing models: {e}")
        return False
def extract_section_context(self, content: str, figure_or_table_id: str) -> Dict[str, str]:
    """Extract surrounding context for a figure/table, preferring the pandoc AST."""
    try:
        # Primary path: structured lookup via the pypandoc-built reference map.
        ref_map = self.extract_references_with_pypandoc(content, "temp_file")
        if figure_or_table_id in ref_map:
            entry = ref_map[figure_or_table_id]
            paragraphs = entry.get('context_paragraphs', [])
            return {
                'title': entry.get('section_title', 'Unknown Section'),
                'content': '\n\n'.join(paragraphs),
            }
    except Exception as e:
        print(f" ⚠️ pypandoc approach failed: {e}")

    # Secondary path: plain paragraph-based scanning.
    return self.extract_paragraph_context(content, figure_or_table_id)
def detect_code_listing(self, content, lst_id):
    """
    Detect code listings with lst-label and lst-cap OR HTML callout with title.

    Handles two patterns:
    1. Traditional: #| lst-label: lst-id + #| lst-cap: Caption
    2. HTML callout: ::: {#lst-id .callout-important title="Caption"}

    Args:
        content (str): File content
        lst_id (str): Listing ID to search for (e.g., 'lst-mlp_layer_matrix')

    Returns:
        dict: Listing definition with caption, full_text, type='listing',
        language, pattern; None if the listing is not found.
    """
    # FIX: require an exact ID match. Without this boundary, searching for
    # 'lst-foo' would also match 'lst-foo_bar' (IDs use word chars and '-').
    id_boundary = r'(?![\w-])'

    # Method 1: Traditional pattern with #| lst-label: and #| lst-cap:
    label_pattern = rf'#\|\s*lst-label:\s*{re.escape(lst_id)}{id_boundary}'

    # Fenced code blocks like ```{python} ... ```
    code_block_pattern = r'```\{([^}]+)\}(.*?)```'

    for block_match in re.finditer(code_block_pattern, content, re.DOTALL):
        code_block = block_match.group(0)
        language = block_match.group(1).strip()

        # Skip blocks that do not carry our listing label
        if not re.search(label_pattern, code_block):
            continue

        # Extract lst-cap from the code block (handle both quoted and unquoted):
        # - Quoted: #| lst-cap: "Caption text"
        # - Unquoted: #| lst-cap: Caption text
        cap_pattern = r'#\|\s*lst-cap:\s*(?:"([^"]*)"|(.+))$'
        cap_match = re.search(cap_pattern, code_block, re.MULTILINE)
        if cap_match:
            # Group 1: quoted caption, Group 2: unquoted caption
            caption = (cap_match.group(1) or cap_match.group(2) or '').strip()
        else:
            caption = ""

        return {
            'caption': caption,
            'full_text': code_block,
            'type': 'listing',
            'language': language,
            'pattern': 'traditional',
        }

    # Method 2: HTML callout pattern with title="Caption"
    # ::: {#lst-id .callout-important title="Caption"} ... code block ... :::
    callout_pattern = (
        rf':::\s*\{{[^}}]*#{re.escape(lst_id)}{id_boundary}'
        rf'[^}}]*title\s*=\s*"([^"]*)"[^}}]*\}}(.*?):::'
    )
    callout_match = re.search(callout_pattern, content, re.DOTALL)
    if callout_match:
        caption = callout_match.group(1).strip()
        callout_content = callout_match.group(2)

        # Extract language from the code block within the callout
        code_in_callout = re.search(r'```\{\.?([^}]+)\}', callout_content)
        language = code_in_callout.group(1).strip() if code_in_callout else 'unknown'

        return {
            'caption': caption,
            'full_text': callout_match.group(0),
            'type': 'listing',
            'language': language,
            'pattern': 'callout',
        }

    return None
def find_listing_definition_in_qmd(self, content, lst_id):
    """
    Find a listing definition in QMD content.

    Args:
        content (str): QMD file content
        lst_id (str): Listing ID to search for (e.g., 'lst-mlp_layer_matrix')

    Returns:
        dict: Listing definition with type, caption, full_text, language;
        None if no detector matches.
    """
    # Code listings currently have a single detector; keep the loop so
    # additional detectors can be appended later without restructuring.
    for detect in (self.detect_code_listing,):
        found = detect(content, lst_id)
        if found:
            return found

    return None
# ================================================================
|
||
# GENERIC MULTI-INSTANCE DETECTION SYSTEM
|
||
# ================================================================
|
||
|
||
def detect_all_instances_by_id(self, content: str, item_id: str, item_type: str) -> List[Dict]:
    """
    Generic method to find ALL instances of any ID (fig-, tbl-, lst-) in content.

    This handles the common scenario where the same ID appears multiple times
    due to HTML/PDF conditional rendering (e.g., content-visible blocks).

    Args:
        content: QMD file content
        item_id: Full ID (e.g., "fig-example", "tbl-data", "lst-code")
        item_type: Type hint ("figure", "table", "listing")

    Returns:
        List of all instances with their captions, formats, and locations
        (figure/table detection is not implemented yet and returns []).
    """
    instances: List[Dict] = []

    # FIX: require an exact ID match so e.g. "lst-a" does not also
    # match "lst-ab" (IDs use word chars and '-').
    id_boundary = r'(?![\w-])'

    if item_type == "listing":
        # Pattern 1: HTML callout format
        # ::: {#lst-id .callout-important title="Caption"}
        callout_pattern = (
            rf':::\s*\{{[^}}]*#{re.escape(item_id)}{id_boundary}'
            rf'[^}}]*title=["\']([^"\']*)["\'][^}}]*\}}'
        )
        for match in re.finditer(callout_pattern, content, re.DOTALL):
            instances.append({
                'format': 'html_callout',
                'caption': match.group(1).strip(),
                'full_text': match.group(0),
                'start': match.start(),
                'end': match.end(),
                'pattern_type': 'callout'
            })

        # Pattern 2: PDF code block format
        # #| lst-label: lst-id
        # #| lst-cap: Caption (or "Caption")
        label_pattern = rf'#\|\s*lst-label:\s*{re.escape(item_id)}{id_boundary}'

        # Scan every fenced code block for the label, then pull its caption.
        for code_match in re.finditer(r'```\{([^}]+)\}(.*?)```', content, re.DOTALL):
            code_content = code_match.group(2)
            if not re.search(label_pattern, code_content):
                continue
            # Look for lst-cap in this code block (quoted or unquoted)
            cap_match = re.search(
                r'#\|\s*lst-cap:\s*(?:["\']([^"\']*)["\']|([^\n]*))', code_content)
            if cap_match:
                caption = (cap_match.group(1) or cap_match.group(2) or '').strip()
                instances.append({
                    'format': 'pdf_code',
                    'caption': caption,
                    'full_text': code_match.group(0),
                    'start': code_match.start(),
                    'end': code_match.end(),
                    'pattern_type': 'code_block',
                    'language': code_match.group(1).strip()
                })

    elif item_type == "figure":
        # TODO: Add figure detection patterns (markdown, tikz, code)
        # For now, fall back to existing detection
        pass

    elif item_type == "table":
        # TODO: Add table detection patterns
        # For now, fall back to existing detection
        pass

    return instances
def update_all_instances_atomically(self, content: str, item_id: str,
                                    instances: List[Dict], new_caption: str) -> Tuple[str, bool, List[str]]:
    """
    Update ALL instances of an ID atomically - success/fail as a unit.

    On any failure the ORIGINAL content is returned untouched, so callers
    never see a half-updated document.

    Args:
        content: Original file content
        item_id: ID to update
        instances: List of instances from detect_all_instances_by_id
        new_caption: New caption text

    Returns:
        Tuple of (updated_content, success, error_messages)
    """
    if not instances:
        return content, False, ["No instances to update"]

    updated_content = content
    error_messages: List[str] = []

    # Process instances in reverse document order so earlier start/end
    # offsets stay valid as replacements are spliced in.
    sorted_instances = sorted(instances, key=lambda x: x['start'], reverse=True)

    for instance in sorted_instances:
        start_pos = instance['start']
        end_pos = instance['end']
        # FIX: operate on this instance's slice only. The old code ran
        # re.sub over the WHOLE document for html_callout, so with several
        # same-ID callouts the first occurrence was rewritten repeatedly
        # while later ones were never touched.
        block_content = updated_content[start_pos:end_pos]
        try:
            if instance['format'] == 'html_callout':
                # Update: title="old caption" -> title="new caption".
                # FIX: callable replacement so backslashes / group refs
                # inside new_caption are taken literally (plain rf'\1{...}\3'
                # would feed them through re's escape processing).
                pattern = rf'(:::\s*\{{[^}}]*#{re.escape(item_id)}[^}}]*title=["\'])([^"\']*?)(["\'][^}}]*\}})'
                new_block_content = re.sub(
                    pattern,
                    lambda m: m.group(1) + new_caption + m.group(3),
                    block_content, count=1)
                if new_block_content == block_content:
                    error_messages.append(f"Failed to update HTML callout for {item_id}")
                    return content, False, error_messages

            elif instance['format'] == 'pdf_code':
                # Update: #| lst-cap: old -> #| lst-cap: "new" (always quoted).
                # Handles both quoted and unquoted original captions.
                pattern = r'(#\|\s*lst-cap:\s*)(?:["\']([^"\']*)["\']|([^\n]*))'
                new_block_content = re.sub(
                    pattern,
                    lambda m: m.group(1) + '"' + new_caption + '"',
                    block_content, count=1)
                if new_block_content == block_content:
                    error_messages.append(f"Failed to update PDF code block for {item_id}")
                    return content, False, error_messages

            else:
                # Unknown instance format: leave it unchanged.
                continue

            # Splice the rewritten block back into the full document.
            updated_content = updated_content[:start_pos] + new_block_content + updated_content[end_pos:]

        except Exception as e:
            error_messages.append(f"Error updating {instance['format']} instance: {e}")
            return content, False, error_messages

    return updated_content, True, []
||
# ================================================================
|
||
# ENHANCED LISTING DETECTION USING GENERIC SYSTEM
|
||
# ================================================================
|
||
|
||
def find_listing_definition_enhanced(self, content: str, lst_id: str) -> Optional[Dict[str, str]]:
    """
    Enhanced listing detection that handles multiple instances.

    Args:
        content: QMD file content
        lst_id: Listing ID to search for (e.g., 'lst-mlp_layer_matrix')

    Returns:
        Dict with canonical caption and instance information
    """
    all_instances = self.detect_all_instances_by_id(content, lst_id, "listing")

    # Nothing found via the generic scanner: defer to the legacy detector.
    if not all_instances:
        return self.find_listing_definition_in_qmd(content, lst_id)

    # Pick the canonical caption and surface any consistency problems.
    canonical_caption, warnings = self.canonicalize_caption(all_instances)
    if warnings:
        print(f" ⚠️ {lst_id}: {'; '.join(warnings)}")

    # Prefer the language reported by a PDF code block, if any.
    language = next(
        (inst['language'] for inst in all_instances
         if inst['format'] == 'pdf_code' and 'language' in inst),
        'unknown')

    return {
        'type': 'listing',
        'caption': canonical_caption,
        'language': language,
        'instances': all_instances,  # Kept for later atomic updates
        'consistency_warnings': warnings,
        'full_text': all_instances[0]['full_text'] if all_instances else ''
    }
||
def clean_caption_artifacts(self, caption: str) -> str:
    """
    Clean unwanted artifacts from captions.

    Removes patterns like:
    - *Source: ...*
    - "via This code snippet"
    - Other common artifacts

    Args:
        caption: Raw caption text

    Returns:
        Cleaned caption text
    """
    if not caption:
        return caption

    # Strip boilerplate fragments, case-insensitively, in a fixed order.
    boilerplate = (
        r'\*\s*Source:\s*[^*]*\*',          # *Source: ...* attributions
        r'via\s+this\s+code\s+snippet\.?',  # "via this code snippet"
        r'through\s+the\s+code\.?',         # "through the code"
    )
    cleaned = caption
    for fragment in boilerplate:
        cleaned = re.sub(fragment, '', cleaned, flags=re.IGNORECASE)

    # Collapse runs of whitespace, then drop stray leading/trailing periods.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    cleaned = re.sub(r'^\.\s*|\s*\.$', '', cleaned).strip()

    return cleaned
def canonicalize_caption(self, instances: List[Dict]) -> Tuple[str, List[str]]:
    """
    Choose canonical caption from multiple instances and validate consistency.
    Enhanced with artifact cleanup.

    Args:
        instances: List of instances from detect_all_instances_by_id

    Returns:
        Tuple of (canonical_caption, warnings_if_inconsistent)
    """
    if not instances:
        return "", ["No instances found"]

    # Single instance: nothing to reconcile, just clean it.
    if len(instances) == 1:
        return self.clean_caption_artifacts(instances[0]['caption']), []

    # Attach a cleaned caption to every instance before comparing.
    enriched = [
        {**inst, 'cleaned_caption': self.clean_caption_artifacts(inst['caption'])}
        for inst in instances
    ]

    non_empty = [entry['cleaned_caption'] for entry in enriched if entry['cleaned_caption']]
    distinct = list(set(non_empty))

    warnings: List[str] = []
    if len(distinct) > 1:
        warnings.append(f"Inconsistent captions found: {distinct}")

    # Preference: an HTML callout caption wins over a PDF code-block one.
    for entry in enriched:
        if entry['format'] == 'html_callout' and entry['cleaned_caption']:
            chosen = entry['cleaned_caption']
            if len(distinct) > 1:
                warnings.append(f"Using HTML callout caption: '{chosen}'")
            return chosen, warnings

    # Otherwise take the first non-empty cleaned caption in document order.
    for text in non_empty:
        if text:
            if len(distinct) > 1:
                warnings.append(f"Using first available caption: '{text}'")
            return text, warnings

    return "", ["No valid captions found"]
||
|
||
def main():
    """CLI entry point: parse arguments and dispatch to the selected mode.

    Modes (mutually exclusive): --build-map, --analyze, --repair, --improve;
    with no mode flag the behavior is identical to --improve.

    Returns:
        0 on success, 1 on any error (intended as a process exit code —
        NOTE(review): the ``__main__`` guard must propagate this value for it
        to reach the shell).
    """
    parser = argparse.ArgumentParser(
        description="Improve figure and table captions using local Ollama models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Improve captions with LLM (recommended default):
  python improve_figure_captions.py -d contents/vol1/
  python improve_figure_captions.py --improve -d contents/vol1/

  # Using different models:
  python improve_figure_captions.py -d contents/vol1/ --model llama3.2:3b
  python improve_figure_captions.py -i -d contents/vol1/ -m qwen2.5:14b
  python improve_figure_captions.py -d contents/vol1/ --model mistral:7b

  # Content filtering:
  python improve_figure_captions.py -d contents/vol1/ --figures-only
  python improve_figure_captions.py -d contents/vol1/ -F # Short form
  python improve_figure_captions.py -d contents/vol1/ --tables-only
  python improve_figure_captions.py -d contents/vol1/ -T # Short form
  python improve_figure_captions.py -d contents/vol1/ --listings-only
  python improve_figure_captions.py -d contents/vol1/ -L # Short form

  # Analysis and utilities:
  python improve_figure_captions.py --build-map -d contents/vol1/
  python improve_figure_captions.py -b -d contents/vol1/
  python improve_figure_captions.py --analyze -d contents/vol1/
  python improve_figure_captions.py --repair -d contents/vol1/

  # Content filtering with other modes:
  python improve_figure_captions.py --analyze -d contents/vol1/ --figures-only
  python improve_figure_captions.py --repair -d contents/vol1/ --tables-only
  python improve_figure_captions.py --build-map -d contents/vol1/ --listings-only
  python improve_figure_captions.py --build-map -d contents/vol1/ -F

  # Multiple directories:
  python improve_figure_captions.py -d contents/vol1/ -d contents/frontmatter/ -m llama3.2:3b

  # Save detailed JSON output:
  python improve_figure_captions.py -d contents/vol1/ --save-json
"""
    )

    # Multiple file/directory input (both flags are repeatable)
    parser.add_argument('-f', '--files', action='append',
                        help='Process specific QMD files (can be used multiple times)')
    parser.add_argument('-d', '--directories', action='append',
                        help='Process directories recursively for .qmd files (can be used multiple times)')

    # Mode selection
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--build-map', '-b', action='store_true',
                       help='Build content map from QMD files and save to JSON')
    group.add_argument('--analyze', '-a', action='store_true',
                       help='Analyze caption quality and validate file structure')
    group.add_argument('--repair', '-r', action='store_true',
                       help='Repair caption formatting issues only')
    group.add_argument('--improve', '-i', action='store_true',
                       help='Improve captions using LLM and update files (default mode)')

    # Content type filtering
    content_group = parser.add_mutually_exclusive_group()
    content_group.add_argument('--figures-only', '-F', action='store_true',
                               help='Process only figures (ignore tables and listings)')
    content_group.add_argument('--tables-only', '-T', action='store_true',
                               help='Process only tables (ignore figures and listings)')
    content_group.add_argument('--listings-only', '-L', action='store_true',
                               help='Process only code listings (ignore figures and tables)')

    # Model options
    parser.add_argument('--model', '-m', default='qwen2.5:7b',
                        help='Ollama model to use (default: qwen2.5:7b)')
    parser.add_argument('--list-models', action='store_true',
                        help='List available Ollama models and exit')

    # Output options
    parser.add_argument('--save-json', action='store_true',
                        help='Save detailed content map to JSON file')

    args = parser.parse_args()

    # Handle --list-models flag (no input paths or model check needed)
    if args.list_models:
        improver = FigureCaptionImprover()
        success = improver.list_available_models()
        return 0 if success else 1

    # Validate that we have input files/directories for other operations
    if not args.files and not args.directories:
        print("❌ Error: --files or --directories required")
        return 1

    # Determine which files/directories to process
    directories = []
    specific_files = []

    if args.directories:
        directories.extend(args.directories)
    if args.files:
        # Store specific files for targeted processing
        specific_files.extend(args.files)
        # Validate that files exist and look like Quarto sources
        for file in args.files:
            if not Path(file).exists():
                print(f"❌ File not found: {file}")
                return 1
            if not file.endswith('.qmd'):
                print(f"❌ Not a QMD file: {file}")
                return 1

    # Initialize improver with specified model
    improver = FigureCaptionImprover(model_name=args.model)

    # Check Ollama and model availability before proceeding
    # NOTE(review): this runs even for --analyze/--repair/--build-map, which
    # may not strictly need the LLM — confirm whether that gating is intended.
    if not improver.check_ollama_and_model(args.model):
        print(f"❌ Cannot proceed without properly configured Ollama and model {args.model}")
        return 1

    try:
        if args.build_map:
            # Build content map and save to JSON
            print("🔍 Building content map from QMD files...")
            content_map = improver.build_content_map_from_qmd(directories,
                                                              figures_only=args.figures_only,
                                                              tables_only=args.tables_only,
                                                              listings_only=args.listings_only,
                                                              specific_files=specific_files)
            if content_map:
                print("✅ Content map building completed!")

                # Always save JSON for --build-map
                improver.save_content_map(content_map)

                # Show extraction report
                stats = content_map['metadata']['extraction_stats']
                if stats['extraction_failures'] == 0:
                    print("🎉 Perfect extraction! All figures and tables successfully processed.")
                else:
                    print(f"⚠️ {stats['extraction_failures']} extraction failures detected.")
                    print("💡 Consider reviewing the files with issues for manual fixes.")

                # Show brief summary
                print(f"\n📋 CONTENT SUMMARY:")
                print(f" 📊 Figures: {stats['figures_found']} total")
                print(f" • Markdown: {stats['markdown_figures']}")
                print(f" • TikZ: {stats['tikz_figures']}")
                print(f" • R: {stats['r_figures']}")
                print(f" • Code: {stats['code_figures']}")
                print(f" 📋 Tables: {stats['tables_found']} total")
                print(f" 📁 Files processed: {content_map['metadata']['qmd_files_scanned']}")

                print(f"\n💾 Content map saved to: content_map.json")
                print(f"📄 You can now review the complete JSON structure!")

            else:
                print("❌ Content map building failed!")
                return 1

        elif args.analyze:
            # Analyze caption quality and validate file structure
            print("🔍 Analyzing caption quality and file structure...")

            # Build content map for validation
            content_map = improver.build_content_map_from_qmd(directories,
                                                              figures_only=args.figures_only,
                                                              tables_only=args.tables_only,
                                                              listings_only=args.listings_only,
                                                              specific_files=specific_files)
            if not content_map:
                print("❌ Failed to build content map for analysis")
                return 1

            # Check caption quality
            improver.check_caption_quality(directories,
                                           figures_only=args.figures_only,
                                           tables_only=args.tables_only,
                                           listings_only=args.listings_only,
                                           content_map=content_map)

            # Validate QMD mapping
            improver.validate_qmd_mapping(directories, content_map)

            print("✅ Analysis completed!")

        elif args.repair:
            # Repair caption formatting issues only (no LLM rewriting)
            print("🔧 Repairing caption formatting issues...")
            content_map = improver.repair_captions(directories,
                                                   figures_only=args.figures_only,
                                                   tables_only=args.tables_only,
                                                   listings_only=args.listings_only,
                                                   specific_files=specific_files)
            if content_map and args.save_json:
                improver.save_content_map(content_map)
                print("💾 Repaired content map saved to content_map.json")
            print("✅ Caption repair completed!")

        elif args.improve:
            # LLM caption improvement mode (explicit)
            print("🚀 Improving captions with LLM...")
            improved_content_map = improver.complete_caption_improvement_workflow(directories, args.save_json,
                                                                                  figures_only=args.figures_only,
                                                                                  tables_only=args.tables_only,
                                                                                  listings_only=args.listings_only)
            if not improved_content_map:
                return 1

        else:
            # Default: Same as --improve (LLM improvement)
            print("🚀 Improving captions with LLM (default mode)...")
            improved_content_map = improver.complete_caption_improvement_workflow(directories, args.save_json,
                                                                                  figures_only=args.figures_only,
                                                                                  tables_only=args.tables_only,
                                                                                  listings_only=args.listings_only)
            if not improved_content_map:
                return 1

    except KeyboardInterrupt:
        print("\n⚠️ Operation cancelled by user")
        return 1
    except Exception as e:
        # Top-level boundary: report and fail with a non-zero status.
        print(f"❌ Error: {e}")
        return 1

    return 0
if __name__ == "__main__":
    # Propagate main()'s 0/1 return value as the process exit status.
    # Previously the result was computed but silently discarded, so the
    # script always exited 0 even on failure.
    raise SystemExit(main())